In [1]:
from PIL import Image,ImageOps
import io
import numpy as np
from matplotlib import pyplot as plt
from package.utils.logger import logger
import torch

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.mixture import GaussianMixture
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.kernel_ridge import KernelRidge

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import torchvision.models as models
from torchvision import transforms
from torchvision.models import VGG16_Weights


from sklearn.metrics import accuracy_score, precision_score, adjusted_rand_score

import pandas as pd
from IPython.display import display

In [2]:

# Bootstrap
# Decode every image stored in the archive into a flat (pixels, 3) RGB array and
# collect the file names and class labels of each split in `dataset`.
raw_dataset = np.load('.ds.tiny/dataset.zip')

dataset = {
    'train': {
        'data': [],
        'names': [],
        'labels': [],
        'unique_labels': [],
    },
    'valid': {
        'data': [],
        'names': [],
        'labels': [],
        'unique_labels': [],
    }
}

images_shape = (200,200)

# Every archive key is a '/'-separated path; components 2, 3 and 4 are read as
# the split ('train' / 'valid'), the class label and the file name.
for dsKey in raw_dataset.keys():
    splittedKey = dsKey.split('/')

    img_type = splittedKey[2]
    img_label = splittedKey[3]
    img_name = splittedKey[4]

    img = Image.open(io.BytesIO(raw_dataset[dsKey]))
    # Force three channels: the reshape below assumes RGB and would raise on a
    # grayscale, palette or RGBA file.
    img = img.convert('RGB')
    img = ImageOps.fit(img, images_shape, Image.Resampling.LANCZOS)

    # One row per pixel, one column per colour channel
    img_array = np.asarray(img).reshape(images_shape[0]*images_shape[1], 3)

    dataset[img_type]['data'].append(img_array)
    dataset[img_type]['names'].append(img_name)
    dataset[img_type]['labels'].append(img_label)

# Build ONE sorted class vocabulary across all splits. Encoding each split on
# its own would give the same integer different meanings in train and valid as
# soon as a class is missing from one of them.
label_names = np.unique(
    [label for split in dataset.values() for label in split['labels']]
)

for img_type in dataset.keys():
    dataset[img_type]['data'] = np.asarray(dataset[img_type]['data'])
    dataset[img_type]['names'] = np.asarray(dataset[img_type]['names'])

    # Position of each label inside the sorted vocabulary == its integer code
    split_labels = np.asarray(dataset[img_type]['labels'], dtype=label_names.dtype)
    dataset[img_type]['unique_labels'] = label_names.copy()
    dataset[img_type]['labels'] = np.searchsorted(label_names, split_labels)

    logger.info([f'data shape({img_type})', dataset[img_type]['data'].shape])
    logger.info([f'data labels({img_type})', dataset[img_type]['labels'].shape])
    logger.info([f'data unique labels({img_type})', dataset[img_type]['unique_labels']])


DEFAULT_LOGGER: 2024-07-02 17:23:10,242 | INFO | 2472717071.py:44 ['data shape(train)', (1500, 40000, 3)]
DEFAULT_LOGGER: 2024-07-02 17:23:10,244 | INFO | 2472717071.py:45 ['data labels(train)', (1500,)]
DEFAULT_LOGGER: 2024-07-02 17:23:10,245 | INFO | 2472717071.py:46 ['data unique labels(train)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]
DEFAULT_LOGGER: 2024-07-02 17:23:10,297 | INFO | 2472717071.py:44 ['data shape(valid)', (500, 40000, 3)]
DEFAULT_LOGGER: 2024-07-02 17:23:10,300 | INFO | 2472717071.py:45 ['data labels(valid)', (500,)]
DEFAULT_LOGGER: 2024-07-02 17:23:10,301 | INFO | 2472717071.py:46 ['data unique labels(valid)', array(['apple_pie', 'bibimbap', 'cannoli', 'edamame', 'falafel',
       'french_toast', 'ice_cream', 'ramen', 'sushi', 'tiramisu'],
      dtype='<U12')]


In [21]:
# Dimensionality reduction
# Fits standardisation + PCA and standardisation + LDA on each colour channel and
# on a grayscale version of the images, records the summed explained-variance
# ratio of every fit, and keeps the projected features for the cells below.
n_components_to_test = {
    "PCA": [3],  # alternative grid left by the author: [3, 10, 50, 100, 200, 500, 1200]
    "LDA": [1],  # alternative grid left by the author: [3, 5, 7, 9]
}

# Seed passed to PCA so that repeated runs produce the same projection whenever
# scikit-learn selects a randomised SVD solver.
REDUCTION_RANDOM_STATE = 0

results_PCA = []
results_LDA = []

# Projected features, indexed as <store>[split]["grayscale"][n_components].
# The classification cells read exactly this layout; before this change nothing
# in the notebook populated it. "original" holds a list with one array per channel.
PCAs_results = {split: {"grayscale": {}, "original": {}} for split in ("train", "valid")}
LDAs_results = {split: {"grayscale": {}, "original": {}} for split in ("train", "valid")}

# Grayscale data: plain average of the colour channels
grayscale_train_images = np.mean(dataset["train"]["data"], axis=2)
grayscale_valid_images = np.mean(dataset["valid"]["data"], axis=2)

n_channels = dataset["train"]["data"].shape[2]


def _fit_and_project(reducer, X_train, X_valid, y_train=None):
    """Fit ``reducer`` on the training matrix and project both splits.

    Parameters
    ----------
    reducer : two-step pipeline whose last step exposes ``explained_variance_ratio_``.
    X_train, X_valid : 2-D feature matrices (one row per image).
    y_train : training labels, required by supervised reducers (LDA); ignored by PCA.

    Returns
    -------
    (train_projection, valid_projection, summed explained-variance ratio)
    """
    reducer.fit(X_train, y_train)
    return (
        reducer.transform(X_train),
        reducer.transform(X_valid),
        np.sum(reducer[-1].explained_variance_ratio_, axis=0),
    )


for n_components in n_components_to_test["PCA"]:
    # One independent pipeline per colour channel plus one for the grayscale data
    PCA_original = [
        make_pipeline(
            StandardScaler(),
            PCA(n_components=n_components, random_state=REDUCTION_RANDOM_STATE),
        )
        for _ in range(n_channels)
    ]
    PCA_grayscale = make_pipeline(
        StandardScaler(),
        PCA(n_components=n_components, random_state=REDUCTION_RANDOM_STATE),
    )

    # Multichannel section
    PCAs_results["train"]["original"][n_components] = []
    PCAs_results["valid"]["original"][n_components] = []
    for i in range(n_channels):
        train_transformed, valid_transformed, explained_variance_ratio_sum = _fit_and_project(
            PCA_original[i],
            dataset["train"]["data"][:, :, i],
            dataset["valid"]["data"][:, :, i],
        )
        PCAs_results["train"]["original"][n_components].append(train_transformed)
        PCAs_results["valid"]["original"][n_components].append(valid_transformed)

        results_PCA.append(
            {
                "method": "PCA",
                "n_components": n_components,
                "channel": i,
                "explained_variance_ratio_sum": explained_variance_ratio_sum,
            }
        )

    # Grayscale section
    train_transformed, valid_transformed, explained_variance_ratio_sum = _fit_and_project(
        PCA_grayscale, grayscale_train_images, grayscale_valid_images
    )
    PCAs_results["train"]["grayscale"][n_components] = train_transformed
    PCAs_results["valid"]["grayscale"][n_components] = valid_transformed

    results_PCA.append(
        {
            "method": "PCA",
            "n_components": n_components,
            "channel": "Grayscale",
            "explained_variance_ratio_sum": explained_variance_ratio_sum,
        }
    )

for n_components in n_components_to_test["LDA"]:
    LDA_original = [
        make_pipeline(
            StandardScaler(), LinearDiscriminantAnalysis(n_components=n_components)
        )
        for _ in range(n_channels)
    ]
    LDA_grayscale = make_pipeline(
        StandardScaler(), LinearDiscriminantAnalysis(n_components=n_components)
    )

    # Multichannel section (LDA is supervised, so the training labels are passed to fit)
    LDAs_results["train"]["original"][n_components] = []
    LDAs_results["valid"]["original"][n_components] = []
    for i in range(n_channels):
        train_transformed, valid_transformed, explained_variance_ratio_sum = _fit_and_project(
            LDA_original[i],
            dataset["train"]["data"][:, :, i],
            dataset["valid"]["data"][:, :, i],
            dataset["train"]["labels"],
        )
        LDAs_results["train"]["original"][n_components].append(train_transformed)
        LDAs_results["valid"]["original"][n_components].append(valid_transformed)

        results_LDA.append(
            {
                "method": "LDA",
                "n_components": n_components,
                "channel": i,
                "explained_variance_ratio_sum": explained_variance_ratio_sum,
            }
        )

    # Grayscale section
    train_transformed, valid_transformed, explained_variance_ratio_sum = _fit_and_project(
        LDA_grayscale,
        grayscale_train_images,
        grayscale_valid_images,
        dataset["train"]["labels"],
    )
    LDAs_results["train"]["grayscale"][n_components] = train_transformed
    LDAs_results["valid"]["grayscale"][n_components] = valid_transformed

    results_LDA.append(
        {
            "method": "LDA",
            "n_components": n_components,
            "channel": "Grayscale",
            "explained_variance_ratio_sum": explained_variance_ratio_sum,
        }
    )

# Convert the per-fit summaries into DataFrames
df_results_PCA = pd.DataFrame(results_PCA)
df_results_LDA = pd.DataFrame(results_LDA)


display(df_results_PCA)
display(df_results_LDA)

Unnamed: 0,method,n_components,channel,explained_variance_ratio_sum
0,PCA,3,0,0.382316
1,PCA,3,1,0.341308
2,PCA,3,2,0.37361
3,PCA,3,Grayscale,0.34685


Unnamed: 0,method,n_components,channel,explained_variance_ratio_sum
0,LDA,1,0,0.199572
1,LDA,1,1,0.369997
2,LDA,1,2,0.449102
3,LDA,1,Grayscale,0.341293


In [5]:
# Classification - KNN
# Scores k-nearest-neighbour classifiers on the grayscale PCA and LDA projections.
# Each table has one row per k and one column per number of retained components;
# every cell is an (accuracy, macro precision) pair rounded to 3 decimals.

k_to_test = {
    'PCA': [3, 5, 9, 15, 21, 55, 111, 251],
    'LDA': [3, 5, 9, 15, 21, 55, 111, 251]
}

KNN_PCA_grayscale_stats = []
KNN_LDA_grayscale_stats = []

KNN_PCA_original_stats = []
KNN_LDA_original_stats = []

for k in k_to_test['PCA']:

    # First entry of the row is k itself, followed by one score pair per column
    row = [k]

    for n_components in n_components_to_test['PCA']:
        knn = KNeighborsClassifier(k)

        knn.fit(PCAs_results['train']['grayscale'][n_components], dataset['train']['labels'])
        preds = knn.predict(PCAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        # zero_division=0 scores a never-predicted class as 0, which is the value the
        # default already used, but without emitting a warning on every such call.
        precision = round(
            precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),
            3,
        )

        row.append((accuracy, precision))

    KNN_PCA_grayscale_stats.append(row)

KNN_PCA_df = pd.DataFrame(KNN_PCA_grayscale_stats, columns=['k\\PCA (grayscale)'] + n_components_to_test['PCA'])
display(KNN_PCA_df)

for k in k_to_test['LDA']:

    row = [k]

    for n_components in n_components_to_test['LDA']:
        # NOTE(review): the LDA branch wraps KNN in one-vs-rest while the PCA branch
        # uses it directly; kept as in the original so the numbers stay comparable
        # with earlier runs. Confirm whether the asymmetry is intended.
        knn = OneVsRestClassifier(KNeighborsClassifier(k))

        knn.fit(LDAs_results['train']['grayscale'][n_components], dataset['train']['labels'])
        preds = knn.predict(LDAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(
            precision_score(dataset['valid']['labels'], preds, average='macro', zero_division=0),
            3,
        )

        row.append((accuracy, precision))

    KNN_LDA_grayscale_stats.append(row)


KNN_LDA_df = pd.DataFrame(KNN_LDA_grayscale_stats, columns=['k\\LDA (grayscale)'] + n_components_to_test['LDA'])
display(KNN_LDA_df)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,k\PCA (grayscale),3,10,50,100,200,500,1200
0,3,"(0.15, 0.148)","(0.168, 0.165)","(0.148, 0.189)","(0.156, 0.213)","(0.146, 0.205)","(0.134, 0.164)","(0.138, 0.142)"
1,5,"(0.156, 0.157)","(0.166, 0.183)","(0.154, 0.186)","(0.156, 0.19)","(0.154, 0.217)","(0.146, 0.217)","(0.144, 0.19)"
2,9,"(0.172, 0.168)","(0.158, 0.165)","(0.156, 0.206)","(0.146, 0.2)","(0.132, 0.2)","(0.132, 0.193)","(0.128, 0.202)"
3,15,"(0.17, 0.164)","(0.166, 0.174)","(0.152, 0.191)","(0.154, 0.228)","(0.134, 0.187)","(0.128, 0.205)","(0.13, 0.243)"
4,21,"(0.182, 0.18)","(0.174, 0.183)","(0.156, 0.198)","(0.134, 0.205)","(0.134, 0.213)","(0.138, 0.179)","(0.138, 0.189)"
5,55,"(0.202, 0.187)","(0.186, 0.19)","(0.16, 0.186)","(0.156, 0.196)","(0.162, 0.214)","(0.134, 0.149)","(0.134, 0.142)"
6,111,"(0.184, 0.161)","(0.21, 0.19)","(0.178, 0.208)","(0.164, 0.179)","(0.162, 0.192)","(0.16, 0.22)","(0.154, 0.161)"
7,251,"(0.208, 0.204)","(0.234, 0.273)","(0.182, 0.206)","(0.16, 0.189)","(0.16, 0.173)","(0.158, 0.149)","(0.162, 0.156)"


Unnamed: 0,k\LDA (grayscale),3,5,7,9
0,3,"(0.144, 0.125)","(0.128, 0.122)","(0.158, 0.156)","(0.152, 0.146)"
1,5,"(0.154, 0.139)","(0.144, 0.14)","(0.162, 0.155)","(0.162, 0.154)"
2,9,"(0.138, 0.123)","(0.148, 0.144)","(0.164, 0.153)","(0.146, 0.136)"
3,15,"(0.144, 0.131)","(0.158, 0.154)","(0.168, 0.158)","(0.154, 0.144)"
4,21,"(0.136, 0.122)","(0.144, 0.144)","(0.162, 0.153)","(0.154, 0.147)"
5,55,"(0.12, 0.107)","(0.148, 0.147)","(0.158, 0.146)","(0.154, 0.146)"
6,111,"(0.128, 0.122)","(0.142, 0.141)","(0.168, 0.155)","(0.152, 0.144)"
7,251,"(0.118, 0.113)","(0.148, 0.148)","(0.16, 0.147)","(0.148, 0.141)"


In [None]:
# Classification - KRR
# Kernel ridge estimators wrapped in a one-vs-one scheme, evaluated on the
# grayscale PCA and LDA projections. Rows are kernels, columns are the number of
# retained components, cells are (accuracy, macro precision) rounded to 3 decimals.

kernels_to_test = {
    'PCA': ['linear', 'poly', 'rbf'],
    'LDA': ['linear', 'poly', 'rbf']
}

KRR_PCA_grayscale_stats = []
KRR_LDA_grayscale_stats = []

for kernel in kernels_to_test['PCA']:
    # The row starts with the kernel name and grows by one score pair per column
    krr_row = [kernel]

    for n_components in n_components_to_test['PCA']:
        krr_train_features = PCAs_results['train']['grayscale'][n_components]
        krr_valid_features = PCAs_results['valid']['grayscale'][n_components]

        krr = OneVsOneClassifier(KernelRidge(kernel=kernel))
        krr.fit(krr_train_features, dataset['train']['labels'])
        preds = krr.predict(krr_valid_features)

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'), 3)
        krr_row.append((accuracy, precision))

    KRR_PCA_grayscale_stats.append(krr_row)

KRR_PCA_df = pd.DataFrame(
    KRR_PCA_grayscale_stats,
    columns=['kernel\\PCA (grayscale)'] + n_components_to_test['PCA'],
)
display(KRR_PCA_df)

for kernel in kernels_to_test['LDA']:
    krr_row = [kernel]

    for n_components in n_components_to_test['LDA']:
        krr_train_features = LDAs_results['train']['grayscale'][n_components]
        krr_valid_features = LDAs_results['valid']['grayscale'][n_components]

        krr = OneVsOneClassifier(KernelRidge(kernel=kernel))
        krr.fit(krr_train_features, dataset['train']['labels'])
        preds = krr.predict(krr_valid_features)

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'), 3)
        krr_row.append((accuracy, precision))

    KRR_LDA_grayscale_stats.append(krr_row)

KRR_LDA_df = pd.DataFrame(
    KRR_LDA_grayscale_stats,
    columns=['kernel\\LDA (grayscale)'] + n_components_to_test['LDA'],
)
display(KRR_LDA_df)

In [7]:
# Classification - SVM
# One-vs-one support vector classifiers on the grayscale PCA and LDA projections.
# Rows are kernels, columns are the number of retained components, cells are
# (accuracy, macro precision) rounded to 3 decimals.

kernels_to_test = {
    'PCA': ['linear', 'poly', 'sigmoid'],
    'LDA': ['linear', 'poly', 'sigmoid'],
}


def _ovo_svc_scores(projections, kernel, n_components):
    """Train a one-vs-one SVC on one grayscale projection and score it on the validation split.

    Parameters
    ----------
    projections : store indexed as ``[split]['grayscale'][n_components]``.
    kernel : kernel name forwarded to ``SVC``.
    n_components : which projection of the store to use.

    Returns
    -------
    (accuracy, macro precision), both rounded to 3 decimals.
    """
    svm = OneVsOneClassifier(SVC(kernel=kernel))
    svm.fit(projections['train']['grayscale'][n_components], dataset['train']['labels'])
    preds = svm.predict(projections['valid']['grayscale'][n_components])
    return (
        round(accuracy_score(dataset['valid']['labels'], preds), 3),
        round(precision_score(dataset['valid']['labels'], preds, average='macro'), 3),
    )


# Each row: kernel name followed by one score pair per tested number of components
SVM_PCA_grayscale_stats = [
    [kernel] + [
        _ovo_svc_scores(PCAs_results, kernel, n_components)
        for n_components in n_components_to_test['PCA']
    ]
    for kernel in kernels_to_test['PCA']
]

SVM_PCA_df = pd.DataFrame(
    SVM_PCA_grayscale_stats,
    columns=['kernel\\PCA (grayscale)'] + n_components_to_test['PCA'],
)
display(SVM_PCA_df)

SVM_LDA_grayscale_stats = [
    [kernel] + [
        _ovo_svc_scores(LDAs_results, kernel, n_components)
        for n_components in n_components_to_test['LDA']
    ]
    for kernel in kernels_to_test['LDA']
]

SVM_LDA_df = pd.DataFrame(
    SVM_LDA_grayscale_stats,
    columns=['kernel\\LDA (grayscale)'] + n_components_to_test['LDA'],
)
display(SVM_LDA_df)

Unnamed: 0,kernel\PCA (grayscale),3,10,50,100,200,500,1200
0,linear,"(0.196, 0.167)","(0.23, 0.208)","(0.188, 0.179)","(0.194, 0.187)","(0.188, 0.188)","(0.18, 0.183)","(0.192, 0.193)"
1,poly,"(0.164, 0.197)","(0.21, 0.236)","(0.206, 0.23)","(0.208, 0.235)","(0.206, 0.243)","(0.198, 0.242)","(0.208, 0.258)"
2,sigmoid,"(0.14, 0.11)","(0.16, 0.143)","(0.204, 0.183)","(0.222, 0.198)","(0.228, 0.203)","(0.232, 0.212)","(0.234, 0.214)"


Unnamed: 0,kernel\LDA (grayscale),3,5,7,9
0,linear,"(0.124, 0.099)","(0.124, 0.138)","(0.134, 0.139)","(0.136, 0.147)"
1,poly,"(0.126, 0.131)","(0.124, 0.119)","(0.156, 0.156)","(0.134, 0.136)"
2,sigmoid,"(0.134, 0.132)","(0.136, 0.137)","(0.148, 0.142)","(0.15, 0.135)"


In [None]:
# Classification - QDA
# One-vs-one quadratic discriminant analysis on the grayscale PCA and LDA
# projections. Cells are (accuracy, macro precision) rounded to 3 decimals.

# No kernel is passed to QDA: the single empty string is only used as the row
# label, which keeps the same table layout as the kernel-based cells.
kernels_to_test = {
    'PCA': [''],
    'LDA': [''],
}

QDA_PCA_grayscale_stats = []
QDA_LDA_grayscale_stats = []

for kernel in kernels_to_test['PCA']:
    qda_row = [kernel]

    for n_components in n_components_to_test['PCA']:
        qda = OneVsOneClassifier(QuadraticDiscriminantAnalysis())
        qda.fit(
            PCAs_results['train']['grayscale'][n_components],
            dataset['train']['labels'],
        )
        preds = qda.predict(PCAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'), 3)
        qda_row.append((accuracy, precision))

    QDA_PCA_grayscale_stats.append(qda_row)

QDA_PCA_df = pd.DataFrame(
    QDA_PCA_grayscale_stats,
    columns=['kernel\\PCA (grayscale)'] + n_components_to_test['PCA'],
)
display(QDA_PCA_df)

for kernel in kernels_to_test['LDA']:
    qda_row = [kernel]

    for n_components in n_components_to_test['LDA']:
        qda = OneVsOneClassifier(QuadraticDiscriminantAnalysis())
        qda.fit(
            LDAs_results['train']['grayscale'][n_components],
            dataset['train']['labels'],
        )
        preds = qda.predict(LDAs_results['valid']['grayscale'][n_components])

        accuracy = round(accuracy_score(dataset['valid']['labels'], preds), 3)
        precision = round(precision_score(dataset['valid']['labels'], preds, average='macro'), 3)
        qda_row.append((accuracy, precision))

    QDA_LDA_grayscale_stats.append(qda_row)

QDA_original_grayscale_df = pd.DataFrame(
    QDA_LDA_grayscale_stats,
    columns=['kernel\\LDA (grayscale)'] + n_components_to_test['LDA'],
)
display(QDA_original_grayscale_df)

In [5]:
# GMM
# Fits Gaussian mixtures (without labels) on the grayscale PCA projection of the
# training split, assigns the validation samples to mixture components and
# compares that assignment with the true labels using the adjusted Rand index.
# Rows are the number of mixture components, columns the number of PCA components.

GMM_n_components_to_test = {
    'PCA': [15, 30, 100, 500]
}

# Fixed seed: GaussianMixture initialisation is random, so without it the table
# below changes on every run.
GMM_RANDOM_STATE = 0

GMM_PCA_score = []

for GMM_n_components in GMM_n_components_to_test['PCA']:

    # First entry of the row is the mixture size, followed by one score per column
    row = [GMM_n_components]

    for n_components in n_components_to_test['PCA']:
        gmm = GaussianMixture(n_components=GMM_n_components, random_state=GMM_RANDOM_STATE)

        gmm.fit(PCAs_results['train']['grayscale'][n_components])

        preds = gmm.predict(PCAs_results['valid']['grayscale'][n_components])

        row.append(round(adjusted_rand_score(dataset['valid']['labels'], preds), 3))

    GMM_PCA_score.append(row)

GMM_PCA_stats = pd.DataFrame(GMM_PCA_score, columns=['components\\PCA components'] + n_components_to_test['PCA'])

display(GMM_PCA_stats)

Unnamed: 0,components\PCA components,3,10,50,100,200,500,1200
0,15,0.017,0.011,0.014,0.001,0.0,0.003,0.0
1,30,0.013,0.011,0.005,0.0,-0.001,0.003,0.004
2,100,0.009,0.013,0.0,0.005,0.006,0.008,0.002
3,500,0.002,0.006,0.008,0.005,0.002,0.01,0.018
