In [15]:
from PIL import Image,ImageOps
import io
import numpy as np
from matplotlib import pyplot as plt
from package.utils.logger import logger
import torch

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.mixture import GaussianMixture
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.kernel_ridge import KernelRidge

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import torchvision.models as models
from torchvision import transforms
from torchvision.models import VGG16_Weights


from sklearn.metrics import accuracy_score, precision_score, adjusted_rand_score

import pandas as pd
from IPython.display import display

In [17]:

# Bootstrap: decode every image stored in the archive into per-split arrays.
raw_dataset = np.load('.ds.tiny/dataset.zip')
# NOTE(review): the archive handle is intentionally left open in case a later
# cell still reads from `raw_dataset`; close it if nothing else needs it.

images_shape = (200,200)

# One bucket per split. 'data', 'names' and 'labels' start as Python lists and
# are replaced by numpy arrays once every image has been read.
dataset = {
    split_name: {'data': [], 'names': [], 'labels': [], 'unique_labels': []}
    for split_name in ('train', 'valid')
}

# Every archive key is a '/'-separated path; its components at index 2, 3 and 4
# are used as the split ('train' / 'valid'), the class label and the image name.
for dsKey in raw_dataset.keys():
    splittedKey = dsKey.split('/')

    img_type = splittedKey[2]
    img_label = splittedKey[3]
    img_name = splittedKey[4]

    img = Image.open(io.BytesIO(raw_dataset[dsKey]))
    # Force exactly three channels: the reshape below assumes (H, W, 3), so a
    # grayscale, palette, RGBA or CMYK image would otherwise raise.
    img = img.convert('RGB')
    img = ImageOps.fit(img, images_shape, Image.Resampling.LANCZOS)

    # Flatten the pixel grid: one row per pixel, one column per colour channel.
    img_array = np.asarray(img).reshape(images_shape[0]*images_shape[1], 3)

    dataset[img_type]['data'].append(img_array)
    dataset[img_type]['names'].append(img_name)
    dataset[img_type]['labels'].append(img_label)

for img_type in dataset.keys():
    split = dataset[img_type]

    split['data'] = np.asarray(split['data'])
    split['names'] = np.asarray(split['names'])

    # Replace the string labels by integer codes; `unique_labels[code]` gives
    # back the original string.
    split['unique_labels'], split['labels'] = np.unique(
        np.asarray(split['labels']), return_inverse=True
    )

    logger.info([f'data shape({img_type})', split['data'].shape])
    logger.info([f'data labels({img_type})', split['labels'].shape])
    logger.info([f'data unique labels({img_type})', split['unique_labels']])


DEFAULT_LOGGER: 2024-06-13 17:45:59,304 | INFO | 3365796789.py:44 ['data shape(train)', (1500, 200, 200)]
DEFAULT_LOGGER: 2024-06-13 17:45:59,305 | INFO | 3365796789.py:45 ['data labels(train)', (1500,)]
DEFAULT_LOGGER: 2024-06-13 17:45:59,306 | INFO | 3365796789.py:46 ['data unique labels(train)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]
DEFAULT_LOGGER: 2024-06-13 17:45:59,312 | INFO | 3365796789.py:44 ['data shape(valid)', (500, 200, 200)]
DEFAULT_LOGGER: 2024-06-13 17:45:59,313 | INFO | 3365796789.py:45 ['data labels(valid)', (500,)]
DEFAULT_LOGGER: 2024-06-13 17:45:59,314 | INFO | 3365796789.py:46 ['data unique labels(valid)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]


In [7]:
# Dimensionality reduction
#
# Fits a StandardScaler + PCA pipeline and a StandardScaler + LDA pipeline
#   * on each colour channel separately ('original'), and
#   * on the channel-averaged image ('grayscale'),
# using the training split only, then projects both splits with the fitted
# pipelines. Results are indexed as <results>[split][variant][n_components].

# Fixed seed for PCA: depending on the input size scikit-learn may pick a
# randomized SVD solver, in which case unseeded runs are not repeatable.
PCA_RANDOM_STATE = 0

n_components_to_test = {
    'PCA': [100],#[3, 10, 50, 100, 200, 500, 1200],
    'LDA': [5]#[3, 5, 7, 9]
}

PCAs_instances = {'original': {}, 'grayscale': {}}
LDAs_instances = {'original': {}, 'grayscale': {}}

PCAs_results = {
    split_name: {'original': {}, 'grayscale': {}} for split_name in ('train', 'valid')
}
LDAs_results = {
    split_name: {'original': {}, 'grayscale': {}} for split_name in ('train', 'valid')
}

# Grayscaled data: average over the last (channel) axis.
grayscale_train_images = np.mean(dataset['train']['data'], axis=2)
grayscale_valid_images = np.mean(dataset['valid']['data'], axis=2)

for n_components in n_components_to_test['PCA']:
    PCAs_instances['original'][n_components] = []
    PCAs_instances['grayscale'][n_components] = []

    PCAs_results['train']['original'][n_components] = []
    PCAs_results['train']['grayscale'][n_components] = []
    PCAs_results['valid']['original'][n_components] = []
    PCAs_results['valid']['grayscale'][n_components] = []

    # One independent pipeline per colour channel.
    PCA_original = [
        make_pipeline(
            StandardScaler(),
            PCA(n_components=n_components, random_state=PCA_RANDOM_STATE)
        )
        for _ in range(3)
    ]

    PCA_grayscale = make_pipeline(
        StandardScaler(),
        PCA(n_components=n_components, random_state=PCA_RANDOM_STATE)
    )

    # Multichannel section
    for i in range(3):
        train_channel = dataset['train']['data'][:,:,i]
        valid_channel = dataset['valid']['data'][:,:,i]

        PCA_original[i].fit(train_channel)
        PCAs_results['train']['original'][n_components].append(PCA_original[i].transform(train_channel))
        PCAs_results['valid']['original'][n_components].append(PCA_original[i].transform(valid_channel))
        # Step 1 of the pipeline is the PCA estimator (step 0 is the scaler).
        logger.info([f'PCA (channel {i}, {n_components} components): explained_variance_ratio sum', np.sum(PCA_original[i][1].explained_variance_ratio_, axis=0)])

    # The three per-channel pipelines are stored together as a single entry.
    PCAs_instances['original'][n_components].append(PCA_original)

    # Grayscale section
    PCA_grayscale.fit(grayscale_train_images)
    # Keep the fitted grayscale pipeline too (this slot used to stay empty).
    PCAs_instances['grayscale'][n_components].append(PCA_grayscale)

    PCAs_results['train']['grayscale'][n_components] = PCA_grayscale.transform(grayscale_train_images)
    PCAs_results['valid']['grayscale'][n_components] = PCA_grayscale.transform(grayscale_valid_images)

    logger.info([f'PCA (grayscale image, {n_components} components): explained_variance_ratio sum', np.sum(PCA_grayscale[1].explained_variance_ratio_, axis=0)])

for n_components in n_components_to_test['LDA']:
    LDAs_instances['original'][n_components] = []
    LDAs_instances['grayscale'][n_components] = []

    LDAs_results['train']['original'][n_components] = []
    LDAs_results['train']['grayscale'][n_components] = []
    LDAs_results['valid']['original'][n_components] = []
    LDAs_results['valid']['grayscale'][n_components] = []

    # One independent pipeline per colour channel.
    LDA_original = [
        make_pipeline(
            StandardScaler(),
            LinearDiscriminantAnalysis(n_components=n_components)
        )
        for _ in range(3)
    ]

    LDA_grayscale = make_pipeline(
        StandardScaler(),
        LinearDiscriminantAnalysis(n_components=n_components)
    )

    # Multichannel section (LDA is supervised, so the training labels are passed to fit)
    for i in range(3):
        train_channel = dataset['train']['data'][:,:,i]
        valid_channel = dataset['valid']['data'][:,:,i]

        LDA_original[i].fit(train_channel, dataset['train']['labels'])
        LDAs_results['train']['original'][n_components].append(LDA_original[i].transform(train_channel))
        LDAs_results['valid']['original'][n_components].append(LDA_original[i].transform(valid_channel))
        logger.info([f'LDA (channel {i}, {n_components} components): explained_variance_ratio sum', np.sum(LDA_original[i][1].explained_variance_ratio_, axis=0)])

    # The three per-channel pipelines are stored together as a single entry.
    LDAs_instances['original'][n_components].append(LDA_original)

    # Grayscale section
    LDA_grayscale.fit(grayscale_train_images,dataset['train']['labels'])
    # Keep the fitted grayscale pipeline too (this slot used to stay empty).
    LDAs_instances['grayscale'][n_components].append(LDA_grayscale)

    LDAs_results['train']['grayscale'][n_components] = LDA_grayscale.transform(grayscale_train_images)
    LDAs_results['valid']['grayscale'][n_components] = LDA_grayscale.transform(grayscale_valid_images)

    logger.info([f'LDA (grayscale image, {n_components} components): explained_variance_ratio sum', np.sum(LDA_grayscale[1].explained_variance_ratio_, axis=0)])


DEFAULT_LOGGER: 2024-06-13 17:36:06,275 | INFO | 3373128535.py:78 ['PCA (channel 0, 100 components): explained_variance_ratio sum', 0.8325306841032433]
DEFAULT_LOGGER: 2024-06-13 17:36:09,367 | INFO | 3373128535.py:78 ['PCA (channel 1, 100 components): explained_variance_ratio sum', 0.813131158115078]
DEFAULT_LOGGER: 2024-06-13 17:36:12,237 | INFO | 3373128535.py:78 ['PCA (channel 2, 100 components): explained_variance_ratio sum', 0.8214282563310229]
DEFAULT_LOGGER: 2024-06-13 17:36:15,068 | INFO | 3373128535.py:88 ['PCA (grayscale image, 100 components): explained_variance_ratio sum', 0.8158022643597131]
DEFAULT_LOGGER: 2024-06-13 17:36:39,691 | INFO | 3373128535.py:125 ['LDA (channel 0, 5 components): explained_variance_ratio sum', 0.6765554309681415]
DEFAULT_LOGGER: 2024-06-13 17:37:05,399 | INFO | 3373128535.py:125 ['LDA (channel 1, 5 components): explained_variance_ratio sum', 0.7566165106333242]
DEFAULT_LOGGER: 2024-06-13 17:37:29,869 | INFO | 3373128535.py:125 ['LDA (channel 2, 

In [14]:
# Classification - KNN
#
# For every k, fit a k-nearest-neighbours classifier on the grayscale PCA / LDA
# projections of the training split and score it on the validation split.
# Each table row is [k, (accuracy, precision), ...] with one tuple per tested
# number of components.

k_to_test = {
    'PCA': [3, 5, 9, 15, 21, 55, 111, 251],
    'LDA': [3, 5, 9, 15, 21, 55, 111, 251]
}

KNN_PCA_grayscale_stats = []
KNN_LDA_grayscale_stats = []

KNN_PCA_original_stats = []
KNN_LDA_original_stats = []

for k in k_to_test['PCA']:
    row = [k]

    for n_components in n_components_to_test['PCA']:
        knn = KNeighborsClassifier(k)

        knn.fit(PCAs_results['train']['grayscale'][n_components], dataset['train']['labels'])
        preds = knn.predict(PCAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        # zero_division=0 scores a never-predicted class as 0, the same value as
        # the default, without emitting an UndefinedMetricWarning per call.
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),3)

        row.append((accuracy, precision))

    KNN_PCA_grayscale_stats.append(row)

KNN_PCA_df = pd.DataFrame(KNN_PCA_grayscale_stats, columns=['k\\PCA (grayscale)'] + n_components_to_test['PCA'])
display(KNN_PCA_df)

for k in k_to_test['LDA']:
    row = [k]

    for n_components in n_components_to_test['LDA']:
        # NOTE(review): the LDA branch wraps KNN in OneVsRestClassifier while the
        # PCA branch uses KNN directly; kept as-is, confirm this is intended.
        knn = OneVsRestClassifier(KNeighborsClassifier(k))

        knn.fit(LDAs_results['train']['grayscale'][n_components], dataset['train']['labels'])
        preds = knn.predict(LDAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),3)

        row.append((accuracy, precision))

    KNN_LDA_grayscale_stats.append(row)

KNN_LDA_df = pd.DataFrame(KNN_LDA_grayscale_stats, columns=['k\\LDA (grayscale)'] + n_components_to_test['LDA'])
display(KNN_LDA_df)


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,k\PCA (grayscake),100
0,3,"(0.16, 0.209)"
1,5,"(0.158, 0.195)"
2,9,"(0.146, 0.195)"
3,15,"(0.148, 0.228)"
4,21,"(0.134, 0.202)"
5,55,"(0.156, 0.19)"
6,111,"(0.174, 0.2)"
7,251,"(0.166, 0.197)"


Unnamed: 0,k\LDA (grayscale),5
0,3,"(0.132, 0.124)"
1,5,"(0.138, 0.136)"
2,9,"(0.14, 0.141)"
3,15,"(0.14, 0.136)"
4,21,"(0.14, 0.136)"
5,55,"(0.13, 0.125)"
6,111,"(0.128, 0.127)"
7,251,"(0.128, 0.13)"


In [5]:
# Classification - KRR
#
# For every kernel, fit a one-vs-one ensemble of KernelRidge models on the
# grayscale PCA / LDA projections of the training split and score it on the
# validation split. Each table row is [kernel, (accuracy, precision), ...] with
# one tuple per tested number of components.

kernels_to_test = {
    'PCA': ['linear', 'poly', 'rbf'],
    'LDA': ['linear', 'poly', 'rbf']
}

KRR_PCA_grayscale_stats = []
KRR_LDA_grayscale_stats = []

for kernel in kernels_to_test['PCA']:
    row = [kernel]

    for n_components in n_components_to_test['PCA']:
        krr = OneVsOneClassifier(KernelRidge(kernel=kernel))

        krr.fit(PCAs_results['train']['grayscale'][n_components], dataset['train']['labels'])

        preds = krr.predict(PCAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        # zero_division=0 scores a never-predicted class as 0, the same value as
        # the default, without emitting an UndefinedMetricWarning per call.
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),3)

        row.append((accuracy, precision))

    KRR_PCA_grayscale_stats.append(row)

KRR_PCA_df = pd.DataFrame(KRR_PCA_grayscale_stats, columns=['kernel\\PCA (grayscale)'] + n_components_to_test['PCA'])
display(KRR_PCA_df)

for kernel in kernels_to_test['LDA']:
    row = [kernel]

    for n_components in n_components_to_test['LDA']:
        krr = OneVsOneClassifier(KernelRidge(kernel=kernel))

        krr.fit(LDAs_results['train']['grayscale'][n_components], dataset['train']['labels'])

        preds = krr.predict(LDAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),3)

        row.append((accuracy, precision))

    KRR_LDA_grayscale_stats.append(row)

KRR_LDA_df = pd.DataFrame(KRR_LDA_grayscale_stats, columns=['kernel\\LDA (grayscale)'] + n_components_to_test['LDA'])
display(KRR_LDA_df)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,kernel\PCA (grayscale),3,10,50,100,200,500,1200
0,linear,"(0.184, 0.106)","(0.208, 0.22)","(0.186, 0.25)","(0.144, 0.101)","(0.132, 0.112)","(0.126, 0.103)","(0.158, 0.144)"
1,poly,"(0.128, 0.026)","(0.076, 0.078)","(0.134, 0.177)","(0.134, 0.103)","(0.136, 0.14)","(0.142, 0.281)","(0.142, 0.182)"
2,rbf,"(0.138, 0.14)","(0.154, 0.154)","(0.164, 0.152)","(0.15, 0.138)","(0.162, 0.16)","(0.154, 0.155)","(0.136, 0.172)"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,kernel\LDA (grayscale),3,5,7,9
0,linear,"(0.114, 0.077)","(0.114, 0.087)","(0.132, 0.085)","(0.134, 0.122)"
1,poly,"(0.092, 0.064)","(0.108, 0.077)","(0.104, 0.101)","(0.112, 0.11)"
2,rbf,"(0.13, 0.104)","(0.134, 0.1)","(0.156, 0.12)","(0.142, 0.126)"


In [6]:
# Classification - SVM
#
# For every kernel, fit a one-vs-one SVC ensemble on the grayscale PCA / LDA
# projections of the training split and score it on the validation split.
# Each table row is [kernel, (accuracy, precision), ...] with one tuple per
# tested number of components.

kernels_to_test = {
    'PCA': ['poly'],#['linear', 'poly', 'sigmoid'],
    'LDA': ['poly']#['linear', 'poly', 'sigmoid'],
}

SVM_PCA_grayscale_stats = []
SVM_LDA_grayscale_stats = []

for kernel in kernels_to_test['PCA']:
    row = [kernel]

    for n_components in n_components_to_test['PCA']:
        svm = OneVsOneClassifier(SVC(kernel=kernel))

        svm.fit(PCAs_results['train']['grayscale'][n_components], dataset['train']['labels'])

        preds = svm.predict(PCAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        # zero_division=0 scores a never-predicted class as 0, the same value as
        # the default, without emitting an UndefinedMetricWarning per call.
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),3)

        row.append((accuracy, precision))

    SVM_PCA_grayscale_stats.append(row)

SVM_PCA_df = pd.DataFrame(SVM_PCA_grayscale_stats, columns=['kernel\\PCA (grayscale)'] + n_components_to_test['PCA'])
display(SVM_PCA_df)

for kernel in kernels_to_test['LDA']:
    row = [kernel]

    for n_components in n_components_to_test['LDA']:
        svm = OneVsOneClassifier(SVC(kernel=kernel))

        svm.fit(LDAs_results['train']['grayscale'][n_components], dataset['train']['labels'])

        preds = svm.predict(LDAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),3)

        row.append((accuracy, precision))

    SVM_LDA_grayscale_stats.append(row)

SVM_LDA_df = pd.DataFrame(SVM_LDA_grayscale_stats, columns=['kernel\\LDA (grayscale)'] + n_components_to_test['LDA'])
display(SVM_LDA_df)

KeyboardInterrupt: 

In [8]:
# Classification - QDA
#
# Fit a one-vs-one QuadraticDiscriminantAnalysis ensemble on the grayscale
# PCA / LDA projections of the training split and score it on the validation
# split. Each table row is [label, (accuracy, precision), ...] with one tuple
# per tested number of components.

# QDA takes no kernel here: the single empty-string entry only keeps the table
# layout (first column + one column per component count) used by the other
# classification cells.
kernels_to_test = {
    'PCA': [''],#['linear', 'poly', 'sigmoid'],
    'LDA': ['']#['linear', 'poly', 'sigmoid'],
}

QDA_PCA_grayscale_stats = []
QDA_LDA_grayscale_stats = []

for kernel in kernels_to_test['PCA']:
    row = [kernel]

    for n_components in n_components_to_test['PCA']:
        qda = OneVsOneClassifier(QuadraticDiscriminantAnalysis())

        qda.fit(PCAs_results['train']['grayscale'][n_components], dataset['train']['labels'])

        preds = qda.predict(PCAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        # zero_division=0 scores a never-predicted class as 0, the same value as
        # the default, without emitting an UndefinedMetricWarning per call.
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),3)

        row.append((accuracy, precision))

    QDA_PCA_grayscale_stats.append(row)

QDA_PCA_df = pd.DataFrame(QDA_PCA_grayscale_stats, columns=['kernel\\PCA (grayscale)'] + n_components_to_test['PCA'])
display(QDA_PCA_df)

for kernel in kernels_to_test['LDA']:
    row = [kernel]

    for n_components in n_components_to_test['LDA']:
        qda = OneVsOneClassifier(QuadraticDiscriminantAnalysis())

        qda.fit(LDAs_results['train']['grayscale'][n_components], dataset['train']['labels'])

        preds = qda.predict(LDAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),3)

        row.append((accuracy, precision))

    QDA_LDA_grayscale_stats.append(row)

QDA_LDA_df = pd.DataFrame(QDA_LDA_grayscale_stats, columns=['kernel\\LDA (grayscale)']+ n_components_to_test['LDA'])
# Previous name of this frame, kept as an alias for any code that still uses it.
QDA_original_grayscale_df = QDA_LDA_df
display(QDA_LDA_df)



Unnamed: 0,kernel\PCA (grayscale),3,10,50,100,200,500,1200
0,,"(0.198, 0.187)","(0.244, 0.226)","(0.192, 0.192)","(0.168, 0.188)","(0.11, 0.11)","(0.108, 0.113)","(0.08, 0.083)"


Unnamed: 0,kernel\LDA (grayscale),3,5,7,9
0,,"(0.128, 0.145)","(0.122, 0.138)","(0.12, 0.167)","(0.128, 0.19)"


In [14]:
# GMM
#
# Unsupervised check: fit a Gaussian mixture on the grayscale PCA projection of
# the training split, assign every validation sample to a mixture component and
# compare those assignments with the true labels via the adjusted Rand index.
# Each table row is [mixture size, score, ...] with one score per tested number
# of PCA components.

# GaussianMixture is randomly initialised; a fixed seed makes the reported
# scores repeatable across runs.
GMM_RANDOM_STATE = 0

GMM_n_components_to_test = {
    'PCA': [15, 30, 100, 500]
}

GMM_PCA_score = []

for GMM_n_components in GMM_n_components_to_test['PCA']:
    row = [GMM_n_components]

    for n_components in n_components_to_test['PCA']:
        gmm = GaussianMixture(n_components=GMM_n_components, random_state=GMM_RANDOM_STATE)

        gmm.fit(PCAs_results['train']['grayscale'][n_components])

        preds = gmm.predict(PCAs_results['valid']['grayscale'][n_components])

        rand_score = round(adjusted_rand_score(dataset['valid']['labels'], preds),3)

        row.append(rand_score)

    GMM_PCA_score.append(row)

GMM_PCA_stats = pd.DataFrame(GMM_PCA_score, columns=['components\\PCA components'] + n_components_to_test['PCA'])

display(GMM_PCA_stats)