In [26]:
from PIL import Image
import io
import numpy as np
from numpy.dtypes import StrDType
from matplotlib import pyplot as plt
from package.utils.logger import logger

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import multivariate_normal
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score

import pandas as pd
from IPython.display import display, HTML

In [3]:

# Bootstrap
raw_dataset = np.load('.ds.tiny/dataset.zip')

dataset = {
    'train': {
        'data': [],
        'names': [],
        'labels': [],
        'unique_labels': [],
    },
    'valid': {
        'data': [],
        'names': [],
        'labels': [],
        'unique_labels': [],
    }
}

images_resize_shape = (128,128)

# For each image we have the path from which we extract the name and the label of the image
for dsKey in raw_dataset.keys():
    splittedKey = dsKey.split('/')

    img_type = splittedKey[2]
    img_label = splittedKey[3]
    img_name = splittedKey[4]
    
    img = Image.open(io.BytesIO(raw_dataset[dsKey])).resize(images_resize_shape, Image.Resampling.LANCZOS)
    img_array = np.asarray(img).reshape(images_resize_shape[0]*images_resize_shape[1], 3)
    
    dataset[img_type]['data'].append(img_array)
    dataset[img_type]['names'].append(img_name)
    dataset[img_type]['labels'].append(img_label)

for img_type in dataset.keys():
    dataset[img_type]['data'] = np.asarray(dataset[img_type]['data'])
    dataset[img_type]['names'] = np.asarray(dataset[img_type]['names'])

    dataset[img_type]['unique_labels'], dataset[img_type]['labels'] = np.unique(np.asarray(dataset[img_type]['labels']), return_inverse=True)

    logger.info([f'data shape({img_type})', dataset[img_type]['data'].shape])
    logger.info([f'data labels({img_type})', dataset[img_type]['labels'].shape])
    logger.info([f'data unique labels({img_type})', dataset[img_type]['unique_labels']])


DEFAULT_LOGGER: 2024-06-03 22:48:57,726 | INFO | 2263878544.py:42 ['data shape(train)', (1500, 16384, 3)]
DEFAULT_LOGGER: 2024-06-03 22:48:57,726 | INFO | 2263878544.py:43 ['data labels(train)', (1500,)]
DEFAULT_LOGGER: 2024-06-03 22:48:57,727 | INFO | 2263878544.py:44 ['data unique labels(train)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]
DEFAULT_LOGGER: 2024-06-03 22:48:57,731 | INFO | 2263878544.py:42 ['data shape(valid)', (500, 16384, 3)]
DEFAULT_LOGGER: 2024-06-03 22:48:57,732 | INFO | 2263878544.py:43 ['data labels(valid)', (500,)]
DEFAULT_LOGGER: 2024-06-03 22:48:57,732 | INFO | 2263878544.py:44 ['data unique labels(valid)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]


In [21]:
# Dimensionality reduction

n_components_to_test = {
    'PCA': [3, 10, 50, 100, 200, 500],
    'LDA': [3, 5, 7, 9]    
}

PCAs_instances = {
    'original': {},
    'grayscale': {}
}

LDAs_instances = {
    'original': {},
    'grayscale': {}
}

PCAs_results = {
    'train': {
        'original': {},
        'grayscale': {}
    },
    'valid': {
        'original': {},
        'grayscale': {}
    },
}

LDAs_results = {
    'train': {
        'original': {},
        'grayscale': {}
    },
    'valid': {
        'original': {},
        'grayscale': {}
    },
}

# Grayscaled data
grayscale_train_images = np.mean(dataset['train']['data'], axis=2)
grayscale_valid_images = np.mean(dataset['valid']['data'], axis=2)

for n_components in n_components_to_test['PCA']:
    PCAs_instances['original'][n_components] = []
    PCAs_instances['grayscale'][n_components] = []

    PCAs_results['train']['original'][n_components] = []
    PCAs_results['train']['grayscale'][n_components] = []
    PCAs_results['valid']['original'][n_components] = []
    PCAs_results['valid']['grayscale'][n_components] = []

    PCA_original = [
        make_pipeline(
            StandardScaler(),
            PCA(n_components=n_components)
        ),
        make_pipeline(
            StandardScaler(),
            PCA(n_components=n_components)
        ),
        make_pipeline(
            StandardScaler(),
            PCA(n_components=n_components)
        )
    ]
    
    PCA_grayscale = make_pipeline(
        StandardScaler(),
        PCA(n_components=n_components)
    )

    # Multichannel section
    for i in range(3):
        PCA_original[i].fit(dataset['train']['data'][:,:,i]) 
        PCAs_results['train']['original'][n_components].append(PCA_original[i].transform(dataset['train']['data'][:,:,i]))
        PCAs_results['valid']['original'][n_components].append(PCA_original[i].transform(dataset['valid']['data'][:,:,i]))
        logger.info([f'PCA (channel {i}, {n_components} components): explained_variance_ratio sum', np.sum(PCA_original[i][1].explained_variance_ratio_, axis=0)])

    PCAs_instances['original'][n_components].append(PCA_original)

    # Grayscale section
    PCA_grayscale.fit(grayscale_train_images)

    PCAs_results['train']['grayscale'][n_components] = PCA_grayscale.transform(grayscale_train_images)
    PCAs_results['valid']['grayscale'][n_components] = PCA_grayscale.transform(grayscale_valid_images)

    logger.info([f'PCA (grayscale image, {n_components} components): explained_variance_ratio sum', np.sum(PCA_grayscale[1].explained_variance_ratio_, axis=0)])

for n_components in n_components_to_test['LDA']:
    LDAs_instances['original'][n_components] = []
    LDAs_instances['grayscale'][n_components] = []

    LDAs_results['train']['original'][n_components] = []
    LDAs_results['train']['grayscale'][n_components] = []
    LDAs_results['valid']['original'][n_components] = []
    LDAs_results['valid']['grayscale'][n_components] = []

    LDA_original = [
        make_pipeline(
            StandardScaler(),
            LinearDiscriminantAnalysis(n_components=n_components)
        ),
        make_pipeline(
            StandardScaler(),
            LinearDiscriminantAnalysis(n_components=n_components)
        ),
        make_pipeline(
            StandardScaler(),
            LinearDiscriminantAnalysis(n_components=n_components)
        )
    ]
    
    LDA_grayscale = make_pipeline(
        StandardScaler(),
        LinearDiscriminantAnalysis(n_components=n_components)
    )


    # Multichannel section
    for i in range(3):
        LDA_original[i].fit(dataset['train']['data'][:,:,i], dataset['train']['labels']) 
        LDAs_results['train']['original'][n_components].append(LDA_original[i].transform(dataset['train']['data'][:,:,i]))
        LDAs_results['valid']['original'][n_components].append(LDA_original[i].transform(dataset['valid']['data'][:,:,i]))
        logger.info([f'LDA (channel {i}, {n_components} components): explained_variance_ratio sum', np.sum(LDA_original[i][1].explained_variance_ratio_, axis=0)])

    LDAs_instances['original'][n_components].append(LDA_original)

    # Grayscale section
    LDA_grayscale.fit(grayscale_train_images,dataset['train']['labels'])

    
    LDAs_results['train']['grayscale'][n_components] = LDA_grayscale.transform(grayscale_train_images)
    LDAs_results['valid']['grayscale'][n_components] = LDA_grayscale.transform(grayscale_valid_images)

    logger.info([f'LDA (grayscale image, {n_components} components): explained_variance_ratio sum', np.sum(LDA_grayscale[1].explained_variance_ratio_, axis=0)])


DEFAULT_LOGGER: 2024-06-03 23:31:09,064 | INFO | 3103018143.py:78 ['PCA (channel 0, 3 components): explained_variance_ratio sum', 0.38574342110284787]
DEFAULT_LOGGER: 2024-06-03 23:31:10,699 | INFO | 3103018143.py:78 ['PCA (channel 1, 3 components): explained_variance_ratio sum', 0.34405484140887604]
DEFAULT_LOGGER: 2024-06-03 23:31:12,224 | INFO | 3103018143.py:78 ['PCA (channel 2, 3 components): explained_variance_ratio sum', 0.3741552504142015]
DEFAULT_LOGGER: 2024-06-03 23:31:13,887 | INFO | 3103018143.py:88 ['PCA (grayscale image, 3 components): explained_variance_ratio sum', 0.34958271805630514]
DEFAULT_LOGGER: 2024-06-03 23:31:15,702 | INFO | 3103018143.py:78 ['PCA (channel 0, 10 components): explained_variance_ratio sum', 0.5580099266581422]
DEFAULT_LOGGER: 2024-06-03 23:31:17,189 | INFO | 3103018143.py:78 ['PCA (channel 1, 10 components): explained_variance_ratio sum', 0.519641371803363]
DEFAULT_LOGGER: 2024-06-03 23:31:18,777 | INFO | 3103018143.py:78 ['PCA (channel 2, 10 com

In [49]:
# Classification - KNN

k_to_test = {
    'PCA': [3, 5, 9, 15, 21, 27],
    'LDA': [3, 5, 9, 15, 21, 27]
}

KNN_PCA_stats = []
KNN_LDA_stats = []

for k_idx, k in enumerate(k_to_test['PCA']):

    KNN_PCA_stats.insert(k_idx,[k])

    for n_components_idx, n_components in enumerate(n_components_to_test['PCA']):

        knn = KNeighborsClassifier(k)

        knn.fit(PCAs_results['valid']['grayscale'][n_components], dataset['valid']['labels'])
        preds = knn.predict(PCAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)
        
        # logger.info([f"KNN on PCA (grayscale images, k = {k}, {n_components} components)", "accuracy", accuracy, "precision", precision])

        KNN_PCA_stats[k_idx].insert(n_components_idx + 1,(accuracy, precision))


for k_idx,k in enumerate(k_to_test['LDA']):
    
    KNN_LDA_stats.insert(k_idx,[k])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['LDA']):
        knn = KNeighborsClassifier(k)
        # TODO: this is not ok...same data should return same classes
        knn.fit(LDAs_results['valid']['grayscale'][n_components], dataset['valid']['labels'])
        preds = knn.predict(LDAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        # logger.info([f"KNN on LDA (grayscale images, k = {k}, {n_components} components)", "accuracy", accuracy, "precision", precision])

        KNN_LDA_stats[k_idx].insert(n_components_idx + 1,(accuracy, precision))

KNN_PCA_df = pd.DataFrame(KNN_PCA_stats, columns=['k\\PCA components'] + n_components_to_test['PCA'])
KNN_LDA_df = pd.DataFrame(KNN_LDA_stats, columns=['k\\LDA components'] + n_components_to_test['LDA'])

display(KNN_PCA_df)
display(KNN_LDA_df)


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f3a989ca0e0>
Traceback (most recent call last):
  File "/home/enrico/anaconda3/envs/uni/lib/python3.10/site-packages/threadpoolctl.py", line 400, in match_module_callback
    if backend not in self.loaded_backends:
  File "/home/enrico/anaconda3/envs/uni/lib/python3.10/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    
  File "/home/enrico/anaconda3/envs/uni/lib/python3.10/site-packages/threadpoolctl.py", line 606, in __init__
    ):
  File "/home/enrico/anaconda3/envs/uni/lib/python3.10/site-packages/threadpoolctl.py", line 646, in get_version
    return num_threads
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f3a84cdbc70>
Traceback (most recent ca

Unnamed: 0,k\PCA components,3,10,50,100,200,500
0,3,"(0.448, 0.5)","(0.486, 0.549)","(0.462, 0.596)","(0.438, 0.569)","(0.43, 0.574)","(0.41, 0.599)"
1,5,"(0.368, 0.38)","(0.426, 0.457)","(0.384, 0.472)","(0.366, 0.451)","(0.368, 0.47)","(0.362, 0.515)"
2,9,"(0.314, 0.312)","(0.368, 0.373)","(0.324, 0.37)","(0.312, 0.39)","(0.328, 0.472)","(0.312, 0.362)"
3,15,"(0.27, 0.266)","(0.342, 0.347)","(0.3, 0.36)","(0.28, 0.334)","(0.26, 0.307)","(0.26, 0.327)"
4,21,"(0.26, 0.254)","(0.314, 0.319)","(0.306, 0.36)","(0.278, 0.335)","(0.292, 0.352)","(0.278, 0.348)"
5,27,"(0.26, 0.258)","(0.302, 0.306)","(0.284, 0.33)","(0.284, 0.379)","(0.278, 0.326)","(0.262, 0.31)"


Unnamed: 0,k\LDA components,3,5,7,9
0,3,"(0.452, 0.524)","(0.438, 0.499)","(0.448, 0.536)","(0.45, 0.521)"
1,5,"(0.37, 0.401)","(0.378, 0.398)","(0.386, 0.417)","(0.384, 0.427)"
2,9,"(0.3, 0.309)","(0.302, 0.32)","(0.284, 0.306)","(0.3, 0.321)"
3,15,"(0.258, 0.253)","(0.264, 0.267)","(0.236, 0.237)","(0.248, 0.255)"
4,21,"(0.268, 0.269)","(0.224, 0.219)","(0.216, 0.22)","(0.24, 0.245)"
5,27,"(0.244, 0.235)","(0.212, 0.218)","(0.214, 0.211)","(0.208, 0.22)"
