In [25]:
from PIL import Image,ImageOps
import io
import numpy as np
from matplotlib import pyplot as plt
from package.utils.logger import logger
import torch

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.kernel_ridge import KernelRidge

from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from torch.utils.data import DataLoader
import torchvision.models as models
from torchvision import transforms
from torchvision.models import VGG16_Weights

import pickle

from sklearn.metrics import accuracy_score, precision_score, adjusted_rand_score

import pandas as pd
from IPython.display import display

# Initialing compute device (use GPU if available).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:

# Bootstrap
raw_dataset = np.load('.ds.tiny/dataset.zip')

dataset = {
    'train': {
        'data': [],
        'names': [],
        'labels': [],
        'unique_labels': [],
    },
    'valid': {
        'data': [],
        'names': [],
        'labels': [],
        'unique_labels': [],
    }
}

images_shape = (224,224)

# For each image we have the path from which we extract the name and the label of the image
for dsKey in raw_dataset.keys():
    splittedKey = dsKey.split('/')

    img_type = splittedKey[2]
    img_label = splittedKey[3]
    img_name = splittedKey[4]
    
    img = Image.open(io.BytesIO(raw_dataset[dsKey]))
    img = ImageOps.fit(img,images_shape, Image.Resampling.LANCZOS).convert('RGB')
    
    img_array = np.asarray(img)#.reshape(images_shape[0]*images_shape[1], 3)
    
    dataset[img_type]['data'].append(img_array)
    dataset[img_type]['names'].append(img_name)
    dataset[img_type]['labels'].append(img_label)

for img_type in dataset.keys():
    dataset[img_type]['data'] = np.asarray(dataset[img_type]['data'])
    dataset[img_type]['names'] = np.asarray(dataset[img_type]['names'])

    dataset[img_type]['unique_labels'], dataset[img_type]['labels'] = np.unique(np.asarray(dataset[img_type]['labels']), return_inverse=True)

    logger.info([f'data shape({img_type})', dataset[img_type]['data'].shape])
    logger.info([f'data labels({img_type})', dataset[img_type]['labels'].shape])
    logger.info([f'data unique labels({img_type})', dataset[img_type]['unique_labels']])


DEFAULT_LOGGER: 2024-06-14 16:17:48,671 | INFO | 1276424662.py:44 ['data shape(train)', (1500, 224, 224, 3)]
DEFAULT_LOGGER: 2024-06-14 16:17:48,672 | INFO | 1276424662.py:45 ['data labels(train)', (1500,)]
DEFAULT_LOGGER: 2024-06-14 16:17:48,673 | INFO | 1276424662.py:46 ['data unique labels(train)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]
DEFAULT_LOGGER: 2024-06-14 16:17:48,681 | INFO | 1276424662.py:44 ['data shape(valid)', (500, 224, 224, 3)]
DEFAULT_LOGGER: 2024-06-14 16:17:48,682 | INFO | 1276424662.py:45 ['data labels(valid)', (500,)]
DEFAULT_LOGGER: 2024-06-14 16:17:48,682 | INFO | 1276424662.py:46 ['data unique labels(valid)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]


In [39]:
normalization_std = [0.229, 0.224, 0.225]
normalization_mean = [0.485, 0.456, 0.406]



loader = transforms.Compose([
    transforms.ToTensor(),
    # transforms.RandomResizedCrop(224),
    # transforms.Normalize(mean=normalization_mean, std=normalization_std)
])

vgg_out = {
    'train': [],
    'valid': []
}

# Initialize the model.
model = models.vgg16(weights=VGG16_Weights.DEFAULT).features.to(device)

for img_type in dataset.keys():
    vgg_out[img_type] = []


    loaded_images = DataLoader(dataset[img_type]['data'], batch_size=dataset[img_type]['data'].shape[0])

    res = model(loader(next(iter(loaded_images))))

    print(res)
    
    for image_idx in range(dataset[img_type]['data'].shape[0]):
        loaded_image = loader(dataset[img_type]['data'][image_idx, :]).unsqueeze(0).to(device)

        res = model(loaded_image)
        features = res.data.detach().cpu().numpy().flatten()
        print(f"Extracting feature: {image_idx}/{dataset[img_type]['data'].shape[0]}")

        vgg_out[img_type].append(features)
    
    vgg_out[img_type] = np.asarray(vgg_out[img_type])
    print(vgg_out[img_type].shape)

pickle.dump(vgg_out, open( "vgg_out.pkl", "wb"))

TypeError: pic should be PIL Image or ndarray. Got <class 'torch.Tensor'>

In [12]:
# preload env 
vgg_out = pickle.load(open( "vgg_out.pkl", "rb" ))

# Dimensionality reduction

n_components_to_test = {
    'PCA': [120, 800],#[3, 10, 50, 100, 200, 500, 1200],
    'LDA': [5, 7],#[3, 5, 7, 9]    
    'TSNE': [2,3]
}

PCAs_instances = {}
LDAs_instances = {}
TSNEs_instances = {}

PCAs_results = {
    'train': {},
    'valid': {},
}

LDAs_results = {
    'train': {},
    'valid': {},
}

TSNEs_results = {
    'train': {},
    'valid': {},
}

for n_components in n_components_to_test['PCA']:
    PCAs_instances[n_components] = []

    PCAs_results['train'][n_components] = []
    PCAs_results['valid'][n_components] = []

    PCA_instance = PCA(n_components=n_components)
    
    PCA_instance.fit(vgg_out['train'])

    PCAs_results['train'][n_components] = PCA_instance.transform(vgg_out['train'])
    PCAs_results['valid'][n_components] = PCA_instance.transform(vgg_out['valid']) 

    PCAs_instances[n_components] = PCA_instance

    logger.info([f'PCA ({n_components} components): explained_variance_ratio sum', np.sum(PCA_instance.explained_variance_ratio_,axis=0)])


for n_components in n_components_to_test['LDA']:

    LDAs_instances[n_components] = []

    LDAs_results['train'][n_components] = []
    LDAs_results['valid'][n_components] = []

    LDA_instance = LinearDiscriminantAnalysis(n_components=n_components)
    
    LDA_instance.fit(vgg_out['train'], dataset['train']['labels'])

    LDAs_results['train'][n_components] = LDA_instance.transform(vgg_out['train'])
    LDAs_results['valid'][n_components] = LDA_instance.transform(vgg_out['valid']) 

    LDAs_instances[n_components] = LDA_instance

    logger.info([f'LDA ({n_components} components): explained_variance_ratio sum', np.sum(LDA_instance.explained_variance_ratio_,axis=0)])

for n_components in n_components_to_test['TSNE']:

    TSNEs_instances[n_components] = []

    TSNEs_results['train'][n_components] = []
    #TSNEs_results['valid'][n_components] = []

    TSNE_instance_train =  TSNE(n_components=n_components, verbose=1, n_iter=3000)
    TSNE_instance_valid =  TSNE(n_components=n_components, verbose=1, n_iter=3000)

    #TSNEs_results['train'][n_components] = TSNE_instance_train.fit_transform(vgg_out['train'])
    #TSNEs_results['valid'][n_components] = TSNE_instance_valid.fit_transform(vgg_out['valid'])

    TSNEs_results['train'][n_components] = TSNE_instance_train.fit_transform(LDAs_results['train'][7])
    TSNEs_results['valid'][n_components] = TSNE_instance_valid.fit_transform(LDAs_results['valid'][7])

    TSNEs_instances[n_components] = [TSNE_instance_train, TSNE_instance_valid]


    # logger.info([f'TSNE ({n_components} components): explained_variance_ratio sum', np.sum(TSNE_instance[1].explained_variance_ratio_,axis=0)])




DEFAULT_LOGGER: 2024-06-14 16:29:44,258 | INFO | 3293090023.py:46 ['PCA (120 components): explained_variance_ratio sum', 0.41986844]
DEFAULT_LOGGER: 2024-06-14 16:29:51,186 | INFO | 3293090023.py:46 ['PCA (800 components): explained_variance_ratio sum', 0.8584969]
DEFAULT_LOGGER: 2024-06-14 16:30:08,156 | INFO | 3293090023.py:65 ['LDA (5 components): explained_variance_ratio sum', 0.78163844]
DEFAULT_LOGGER: 2024-06-14 16:30:25,789 | INFO | 3293090023.py:65 ['LDA (7 components): explained_variance_ratio sum', 0.90211105]
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1500 samples in 0.002s...
[t-SNE] Computed neighbors for 1500 samples in 0.073s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1500
[t-SNE] Computed conditional probabilities for sample 1500 / 1500
[t-SNE] Mean sigma: 0.862684
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.166275
[t-SNE] KL divergence after 3000 iterations: 0.914307
[t-SNE] Computing 91 nearest neighbors.

In [19]:
fig_2d = plt.figure()
ax = fig_2d.add_subplot()

for i in range(len(dataset['train']['unique_labels'])):
    classIdxs = dataset['train']['labels'] == i

    tsne_features = TSNEs_results['train'][2][classIdxs,:]

    print(tsne_features.shape)

    ax.set_label(dataset['train']['unique_labels'][i])
    ax.scatter(tsne_features[:,0], tsne_features[:,1], marker='.', label=dataset['train']['unique_labels'][i])
plt.legend(loc="upper left")

fig_3d = plt.figure()
ax = fig_3d.add_subplot(projection='3d')
for i in range(len(dataset['train']['unique_labels'])):
    classIdxs = dataset['train']['labels'] == i

    tsne_features = TSNEs_results['train'][3][classIdxs,:]
    
    ax.scatter(tsne_features[:,0], tsne_features[:,1], tsne_features[:,2], marker='.', label=dataset['train']['unique_labels'][i])

plt.legend(loc="upper left")
plt.show()

(150, 2)
(150, 2)
(150, 2)
(150, 2)
(150, 2)
(150, 2)
(150, 2)
(150, 2)
(150, 2)
(150, 2)


In [20]:
# Classification - KNN

k_to_test = {
    'VGG': [3, 5, 9, 15, 21, 55, 111, 251],
    'PCA': [3, 5, 9, 15, 21, 55, 111, 251],
    'LDA': [3, 5, 9, 15, 21, 55, 111, 251]
}

KNN_VGG_stats = []
KNN_PCA_stats = []
KNN_LDA_stats = []

for k_idx, k in enumerate(k_to_test['VGG']):

    KNN_VGG_stats.insert(k_idx,[k])

    knn = KNeighborsClassifier(k)

    knn.fit(vgg_out['train'], dataset['train']['labels'])
    preds = knn.predict(vgg_out['valid'])

    accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
    precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)
    
    KNN_VGG_stats[k_idx].append((accuracy, precision))

    #ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()

KNN_VGG_df = pd.DataFrame(KNN_VGG_stats, columns=['k\\VGG', ''])
display(KNN_VGG_df)

for k_idx, k in enumerate(k_to_test['PCA']):

    KNN_PCA_stats.insert(k_idx,[k])

    for n_components_idx, n_components in enumerate(n_components_to_test['PCA']):
        knn = OneVsOneClassifier(KNeighborsClassifier(k))

        knn.fit(PCAs_results['train'][n_components], dataset['train']['labels'])
        preds = knn.predict(PCAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)
        
        KNN_PCA_stats[k_idx].insert(n_components_idx + 1,(accuracy, precision))

        #ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()

KNN_PCA_df = pd.DataFrame(KNN_PCA_stats, columns=['k\\PCA components'] + n_components_to_test['PCA'])
display(KNN_PCA_df)

for k_idx,k in enumerate(k_to_test['LDA']):
    
    KNN_LDA_stats.insert(k_idx,[k])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['LDA']):
        knn = OneVsOneClassifier( KNeighborsClassifier(k))
        
        knn.fit(LDAs_results['train'][n_components], dataset['train']['labels'])
        preds = knn.predict(LDAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        KNN_LDA_stats[k_idx].insert(n_components_idx + 1,(accuracy, precision))
        # ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()


KNN_LDA_df = pd.DataFrame(KNN_LDA_stats, columns=['k\\LDA components'] + n_components_to_test['LDA'])
display(KNN_LDA_df)

plt.show()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,k\VGG,Unnamed: 2
0,3,"(0.276, 0.521)"
1,5,"(0.318, 0.404)"
2,9,"(0.334, 0.444)"
3,15,"(0.344, 0.487)"
4,21,"(0.322, 0.502)"
5,55,"(0.264, 0.519)"
6,111,"(0.23, 0.643)"
7,251,"(0.23, 0.325)"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,k\PCA components,120,800
0,3,"(0.544, 0.564)","(0.358, 0.444)"
1,5,"(0.554, 0.589)","(0.346, 0.471)"
2,9,"(0.548, 0.616)","(0.334, 0.49)"
3,15,"(0.546, 0.638)","(0.316, 0.568)"
4,21,"(0.512, 0.634)","(0.29, 0.591)"
5,55,"(0.42, 0.619)","(0.212, 0.445)"
6,111,"(0.358, 0.632)","(0.212, 0.316)"
7,251,"(0.234, 0.367)","(0.222, 0.241)"


Unnamed: 0,k\LDA components,5,7
0,3,"(0.496, 0.526)","(0.596, 0.615)"
1,5,"(0.52, 0.572)","(0.628, 0.649)"
2,9,"(0.538, 0.596)","(0.642, 0.667)"
3,15,"(0.548, 0.62)","(0.628, 0.656)"
4,21,"(0.55, 0.617)","(0.632, 0.665)"
5,55,"(0.544, 0.632)","(0.642, 0.685)"
6,111,"(0.536, 0.641)","(0.62, 0.69)"
7,251,"(0.502, 0.617)","(0.596, 0.69)"


In [21]:
# Classification - SVM

kernels_to_test = {
    'PCA': ['linear', 'poly', 'sigmoid'],#['linear', 'poly', 'sigmoid'],
    'LDA': ['linear', 'poly', 'sigmoid'],#['linear', 'poly', 'sigmoid'],
    'TSNE': ['linear', 'poly', 'sigmoid']#['linear', 'poly', 'sigmoid'],
}

SVM_PCA_stats = []
SVM_LDA_stats = []
SVM_TSNE_stats = []

for kernel_idx,kernel in enumerate(kernels_to_test['PCA']):

    SVM_PCA_stats.insert(kernel_idx,[kernel])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['PCA']):#n_components_to_test['PCA']):
        svm = OneVsOneClassifier(SVC(kernel=kernel))

        svm.fit(PCAs_results['train'][n_components], dataset['train']['labels'])

        preds = svm.predict(PCAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        SVM_PCA_stats[kernel_idx].insert(n_components_idx + 1,(accuracy, precision))

SVM_PCA_df = pd.DataFrame(SVM_PCA_stats, columns=['kernel\\PCA components'] + n_components_to_test['PCA'])
display(SVM_PCA_df)

for kernel_idx,kernel in enumerate(kernels_to_test['LDA']):

    SVM_LDA_stats.insert(kernel_idx,[kernel])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['LDA']):
        svm = OneVsOneClassifier(SVC(kernel=kernel))

        svm.fit(LDAs_results['train'][n_components], dataset['train']['labels'])

        preds = svm.predict(LDAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        SVM_LDA_stats[kernel_idx].insert(n_components_idx + 1,(accuracy, precision))

SVM_LDA_df = pd.DataFrame(SVM_LDA_stats, columns=['kernel\\LDA components'] + n_components_to_test['LDA'])
display(SVM_LDA_df)

for kernel_idx,kernel in enumerate(kernels_to_test['TSNE']):

    SVM_TSNE_stats.insert(kernel_idx,[kernel])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['TSNE']):
        svm = OneVsOneClassifier(SVC(kernel=kernel))

        svm.fit(TSNEs_results['train'][n_components], dataset['train']['labels'])

        preds = svm.predict(TSNEs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        SVM_TSNE_stats[kernel_idx].insert(n_components_idx + 1,(accuracy, precision))

SVM_TSNE_df = pd.DataFrame(SVM_TSNE_stats, columns=['kernel\\TSNE components'] + n_components_to_test['TSNE'])
display(SVM_TSNE_df)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,kernel\PCA components,120,800
0,linear,"(0.678, 0.69)","(0.722, 0.727)"
1,poly,"(0.428, 0.699)","(0.198, 0.728)"
2,sigmoid,"(0.714, 0.723)","(0.728, 0.735)"


Unnamed: 0,kernel\LDA components,5,7
0,linear,"(0.512, 0.562)","(0.59, 0.627)"
1,poly,"(0.446, 0.655)","(0.466, 0.727)"
2,sigmoid,"(0.496, 0.498)","(0.616, 0.616)"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,kernel\TSNE components,2,3
0,linear,"(0.038, 0.02)","(0.07, 0.044)"
1,poly,"(0.08, 0.016)","(0.078, 0.026)"
2,sigmoid,"(0.036, 0.032)","(0.032, 0.035)"


In [22]:
# Classification - SGD

losses_to_test = {
    'PCA': ['modified_huber', 'log_loss', 'hinge'],
    'LDA': ['modified_huber', 'log_loss', 'hinge'],#['modified_huber', 'log_loss', 'hinge']
}

SGD_PCA_grayscale_stats = []
SGD_LDA_grayscale_stats = []

for loss_idx,loss in enumerate(losses_to_test['PCA']):

    SGD_PCA_grayscale_stats.insert(loss_idx,[loss])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['PCA']):#n_components_to_test['PCA']):
        svm = OneVsOneClassifier(SGDClassifier(loss=loss, max_iter=10000 ))

        svm.fit(PCAs_results['train'][n_components], dataset['train']['labels'])

        preds = svm.predict(PCAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        SGD_PCA_grayscale_stats[loss_idx].insert(n_components_idx + 1,(accuracy, precision))
        # ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()

SGD_PCA_df = pd.DataFrame(SGD_PCA_grayscale_stats, columns=['loss\\PCA'] + n_components_to_test['PCA'])
display(SGD_PCA_df)

for loss_idx,loss in enumerate(losses_to_test['LDA']):

    SGD_LDA_grayscale_stats.insert(loss_idx,[loss])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['LDA']):
        svm = OneVsOneClassifier(SGDClassifier(loss=loss, max_iter=10000 ))

        svm.fit(LDAs_results['train'][n_components], dataset['train']['labels'])

        preds = svm.predict(LDAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        SGD_LDA_grayscale_stats[loss_idx].insert(n_components_idx + 1,(accuracy, precision))

SGD_LDA_df = pd.DataFrame(SGD_LDA_grayscale_stats, columns=['loss\\LDA'] + n_components_to_test['LDA'])
display(SGD_LDA_df)

Unnamed: 0,loss\PCA,120,800
0,modified_huber,"(0.684, 0.694)","(0.708, 0.712)"
1,log_loss,"(0.676, 0.681)","(0.718, 0.721)"
2,hinge,"(0.694, 0.696)","(0.704, 0.714)"


Unnamed: 0,loss\LDA,5,7
0,modified_huber,"(0.494, 0.515)","(0.528, 0.544)"
1,log_loss,"(0.488, 0.463)","(0.544, 0.565)"
2,hinge,"(0.494, 0.49)","(0.546, 0.584)"


In [None]:
loader = transforms.Compose([
    
])