In [19]:
from PIL import Image,ImageOps
import io
import numpy as np
from matplotlib import pyplot as plt
from package.utils.logger import logger
import torch

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import torchvision.models as models
from torchvision import transforms
from torchvision.models import VGG16_Weights


from sklearn.metrics import accuracy_score, precision_score, adjusted_rand_score

import pandas as pd
from IPython.display import display

In [3]:

# Bootstrap
raw_dataset = np.load('.ds.tiny/dataset.zip')

dataset = {
    'train': {
        'data': [],
        'names': [],
        'labels': [],
        'unique_labels': [],
    },
    'valid': {
        'data': [],
        'names': [],
        'labels': [],
        'unique_labels': [],
    }
}

images_shape = (200,200)

# For each image we have the path from which we extract the name and the label of the image
for dsKey in raw_dataset.keys():
    splittedKey = dsKey.split('/')

    img_type = splittedKey[2]
    img_label = splittedKey[3]
    img_name = splittedKey[4]
    
    img = Image.open(io.BytesIO(raw_dataset[dsKey]))
    img = ImageOps.fit(img,images_shape, Image.Resampling.LANCZOS).convert('P')
    
    img_array = np.asarray(img).reshape(images_shape[0]*images_shape[1])
    
    dataset[img_type]['data'].append(img_array)
    dataset[img_type]['names'].append(img_name)
    dataset[img_type]['labels'].append(img_label)

for img_type in dataset.keys():
    dataset[img_type]['data'] = np.asarray(dataset[img_type]['data'])
    dataset[img_type]['names'] = np.asarray(dataset[img_type]['names'])

    dataset[img_type]['unique_labels'], dataset[img_type]['labels'] = np.unique(np.asarray(dataset[img_type]['labels']), return_inverse=True)

    logger.info([f'data shape({img_type})', dataset[img_type]['data'].shape])
    logger.info([f'data labels({img_type})', dataset[img_type]['labels'].shape])
    logger.info([f'data unique labels({img_type})', dataset[img_type]['unique_labels']])


DEFAULT_LOGGER: 2024-06-13 18:26:45,220 | INFO | 4067204959.py:44 ['data shape(train)', (1500, 40000)]
DEFAULT_LOGGER: 2024-06-13 18:26:45,221 | INFO | 4067204959.py:45 ['data labels(train)', (1500,)]
DEFAULT_LOGGER: 2024-06-13 18:26:45,223 | INFO | 4067204959.py:46 ['data unique labels(train)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]
DEFAULT_LOGGER: 2024-06-13 18:26:45,229 | INFO | 4067204959.py:44 ['data shape(valid)', (500, 40000)]
DEFAULT_LOGGER: 2024-06-13 18:26:45,231 | INFO | 4067204959.py:45 ['data labels(valid)', (500,)]
DEFAULT_LOGGER: 2024-06-13 18:26:45,232 | INFO | 4067204959.py:46 ['data unique labels(valid)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]


In [30]:
# Dimensionality reduction

n_components_to_test = {
    'PCA': [120,1000],#[3, 10, 50, 100, 200, 500, 1200],
    'LDA': [7]#[3, 5, 7, 9]    
}

PCAs_instances = {}

LDAs_instances = {}

PCAs_results = {
    'train': {},
    'valid': {},
}

LDAs_results = {
    'train': {},
    'valid': {},
}

for n_components in n_components_to_test['PCA']:
    PCAs_instances[n_components] = []

    PCAs_results['train'][n_components] = []
    PCAs_results['valid'][n_components] = []

    PCA_instance = make_pipeline(
        StandardScaler(),
        PCA(n_components=n_components)
    )
    
    PCA_instance.fit(dataset['train']['data'])

    PCAs_results['train'][n_components] = PCA_instance.transform(dataset['train']['data'])
    PCAs_results['valid'][n_components] = PCA_instance.transform(dataset['valid']['data']) 

    PCAs_instances[n_components] = PCA_instance

    logger.info([f'PCA ({n_components} components): explained_variance_ratio sum', np.sum(PCA_instance[1].explained_variance_ratio_,axis=0)])


for n_components in n_components_to_test['LDA']:

    LDAs_instances[n_components] = []

    LDAs_results['train'][n_components] = []
    LDAs_results['valid'][n_components] = []

    LDA_instance = make_pipeline(
        StandardScaler(),
        LinearDiscriminantAnalysis(n_components=n_components)
    )
    
    LDA_instance.fit(dataset['train']['data'], dataset['train']['labels'])

    LDAs_results['train'][n_components] = LDA_instance.transform(dataset['train']['data'])
    LDAs_results['valid'][n_components] = LDA_instance.transform(dataset['valid']['data']) 

    LDAs_instances[n_components] = LDA_instance

    logger.info([f'LDA ({n_components} components): explained_variance_ratio sum', np.sum(LDA_instance[1].explained_variance_ratio_,axis=0)])


DEFAULT_LOGGER: 2024-06-13 18:38:35,371 | INFO | 2400584345.py:40 ['PCA (grayscale image, 120 components): explained_variance_ratio sum', 0.7869457025728887]
DEFAULT_LOGGER: 2024-06-13 18:38:59,463 | INFO | 2400584345.py:40 ['PCA (grayscale image, 1000 components): explained_variance_ratio sum', 0.9677803302137998]
DEFAULT_LOGGER: 2024-06-13 18:39:12,378 | INFO | 2400584345.py:62 ['LDA (grayscale image, 7 components): explained_variance_ratio sum', 0.8354278281803277]


In [31]:
# Classification - KNN

k_to_test = {
    'PCA': [11,19,27],#np.linspace(15,25, dtype=int),#[3, 5, 9, 15, 21, 55, 111, 251],
    'LDA': []#[3, 5, 9, 15, 21, 55, 111, 251]
}

KNN_PCA_stats = []
KNN_LDA_stats = []

for k_idx, k in enumerate(k_to_test['PCA']):

    KNN_PCA_stats.insert(k_idx,[k])

    for n_components_idx, n_components in enumerate(n_components_to_test['PCA']):
        knn = OneVsOneClassifier(KNeighborsClassifier(k))

        knn.fit(PCAs_results['train'][n_components], dataset['train']['labels'])
        preds = knn.predict(PCAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)
        
        KNN_PCA_stats[k_idx].insert(n_components_idx + 1,(accuracy, precision))

        #ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()

KNN_PCA_df = pd.DataFrame(KNN_PCA_stats, columns=['k\\PCA components'] + n_components_to_test['PCA'])
display(KNN_PCA_df)

for k_idx,k in enumerate(k_to_test['LDA']):
    
    KNN_LDA_stats.insert(k_idx,[k])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['LDA']):
        knn = OneVsOneClassifier( KNeighborsClassifier(k))
        
        knn.fit(LDAs_results['train'][n_components], dataset['train']['labels'])
        preds = knn.predict(LDAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        KNN_LDA_stats[k_idx].insert(n_components_idx + 1,(accuracy, precision))


KNN_LDA_df = pd.DataFrame(KNN_LDA_stats, columns=['k\\LDA components'] + n_components_to_test['LDA'])
display(KNN_LDA_df)

plt.show()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,k\PCA components,120,1000
0,11,"(0.192, 0.211)","(0.166, 0.195)"
1,19,"(0.194, 0.285)","(0.168, 0.106)"
2,27,"(0.2, 0.206)","(0.184, 0.186)"


Unnamed: 0,k\LDA components,7


In [32]:
# Classification - SGD

losses_to_test = {
    'PCA': ['modified_huber', 'log_loss', 'hinge'],
    'LDA': ['modified_huber', 'log_loss', 'hinge'],#['modified_huber', 'log_loss', 'hinge']
}

SGD_PCA_grayscale_stats = []
SGD_LDA_grayscale_stats = []

for loss_idx,loss in enumerate(losses_to_test['PCA']):

    SGD_PCA_grayscale_stats.insert(loss_idx,[loss])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['PCA']):#n_components_to_test['PCA']):
        svm = OneVsOneClassifier(SGDClassifier(loss=loss, max_iter=10000 ))

        svm.fit(PCAs_results['train'][n_components], dataset['train']['labels'])

        preds = svm.predict(PCAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        SGD_PCA_grayscale_stats[loss_idx].insert(n_components_idx + 1,(accuracy, precision))
        # ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()

SGD_PCA_df = pd.DataFrame(SGD_PCA_grayscale_stats, columns=['loss\\PCA (grayscale)'] + n_components_to_test['PCA'])
display(SGD_PCA_df)

for loss_idx,loss in enumerate(losses_to_test['LDA']):

    SGD_LDA_grayscale_stats.insert(loss_idx,[loss])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['LDA']):
        svm = OneVsOneClassifier(SGDClassifier(loss=loss, max_iter=10000 ))

        svm.fit(LDAs_results['train'][n_components], dataset['train']['labels'])

        preds = svm.predict(LDAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        SGD_LDA_grayscale_stats[loss_idx].insert(n_components_idx + 1,(accuracy, precision))

SGD_LDA_df = pd.DataFrame(SGD_LDA_grayscale_stats, columns=['loss\\LDA (grayscale)'] + n_components_to_test['LDA'])
display(SGD_LDA_df)

# plt.show()

Unnamed: 0,loss\PCA (grayscale),120,1000
0,modified_huber,"(0.194, 0.184)","(0.196, 0.191)"
1,log_loss,"(0.192, 0.183)","(0.186, 0.182)"
2,hinge,"(0.186, 0.181)","(0.18, 0.179)"


Unnamed: 0,loss\LDA (grayscale),7
0,modified_huber,"(0.178, 0.206)"
1,log_loss,"(0.186, 0.19)"
2,hinge,"(0.186, 0.184)"


In [None]:
# Classification - SVM

kernels_to_test = {
    'PCA': ['poly', 'sigmoid'],#['linear', 'poly', 'sigmoid'],
    'LDA': ['linear']#['linear', 'poly', 'sigmoid'],
}

SVM_PCA_stats = []
SVM_LDA_stats = []

for kernel_idx,kernel in enumerate(kernels_to_test['PCA']):

    SVM_PCA_stats.insert(kernel_idx,[kernel])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['PCA']):#n_components_to_test['PCA']):
        svm = OneVsOneClassifier(SVC(kernel=kernel))

        svm.fit(PCAs_results['train'][n_components], dataset['train']['labels'])

        preds = svm.predict(PCAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        SVM_PCA_stats[kernel_idx].insert(n_components_idx + 1,(accuracy, precision))

SVM_PCA_df = pd.DataFrame(SVM_PCA_stats, columns=['kernel\\PCA components'] + n_components_to_test['PCA'])
display(SVM_PCA_df)

for kernel_idx,kernel in enumerate(kernels_to_test['LDA']):

    SVM_LDA_stats.insert(kernel_idx,[kernel])
    
    for n_components_idx, n_components in enumerate(n_components_to_test['LDA']):
        svm = OneVsOneClassifier(SVC(kernel=kernel))

        svm.fit(LDAs_results['train'][n_components], dataset['train']['labels'])

        preds = svm.predict(LDAs_results['valid'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'),3)

        SVM_LDA_stats[kernel_idx].insert(n_components_idx + 1,(accuracy, precision))

SVM_LDA_df = pd.DataFrame(SVM_LDA_stats, columns=['kernel\\LDA components'] + n_components_to_test['LDA'])
display(SVM_LDA_df)

Unnamed: 0,kernel\PCA components,120,1000
0,poly,"(0.216, 0.258)","(0.208, 0.268)"
1,sigmoid,"(0.226, 0.197)","(0.226, 0.21)"


Unnamed: 0,kernel\LDA components,7
0,poly,"(0.162, 0.204)"
1,sigmoid,"(0.164, 0.161)"
