In [14]:
import io
import pickle

from PIL import Image, ImageOps
import numpy as np
from matplotlib import pyplot as plt
import torch

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.kernel_ridge import KernelRidge

from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
from torchvision import transforms
from torchvision.models import VGG16_Weights

from sklearn.metrics import accuracy_score, precision_score, f1_score

import pandas as pd
from IPython.display import display

# Initialing compute device (use GPU if available).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'[torch] using {device}')

[torch] using cpu


### Dataset bootstrap

In [2]:

# Load the dataset
raw_dataset = np.load("dataset_food101tiny.zip", allow_pickle=True)

dataset = {
    "train": {
        "data": [],
        "names": [],
        "labels": [],
        "unique_labels": [],
    },
    "valid": {
        "data": [],
        "names": [],
        "labels": [],
        "unique_labels": [],
    },
}

images_shape = (224, 224)

# For each image we have the path from which we extract the name and the label of the image
for dsKey in raw_dataset.keys():
    splittedKey = dsKey.split("/")

    img_type = splittedKey[2]
    img_label = splittedKey[3]
    img_name = splittedKey[4]

    img = Image.open(io.BytesIO(raw_dataset[dsKey]))
    img = ImageOps.fit(img, images_shape, Image.Resampling.LANCZOS).convert("RGB")

    img_array = np.asarray(img)

    dataset[img_type]["data"].append(img_array)
    dataset[img_type]["names"].append(img_name)
    dataset[img_type]["labels"].append(img_label)

for img_type in dataset.keys():
    dataset[img_type]["data"] = np.asarray(dataset[img_type]["data"])
    dataset[img_type]["names"] = np.asarray(dataset[img_type]["names"])

    dataset[img_type]["unique_labels"], dataset[img_type]["labels"] = np.unique(
        np.asarray(dataset[img_type]["labels"]), return_inverse=True
    )

### Feature extraction using VGG

Normalization mean and standard deviation are [here](https://pytorch.org/hub/pytorch_vision_vgg)

In [6]:
normalization_std = [0.229, 0.224, 0.225]
normalization_mean = [0.485, 0.456, 0.406]

loader = transforms.Compose(
    [
        # transforms.ToPILImage(),
        transforms.ToTensor(),
        transforms.Normalize(mean=normalization_mean, std=normalization_std),
    ]
)

vgg_out = {"train": [], "valid": []}

# Initialize the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features.to(device)

for img_type in dataset.keys():
    vgg_out[img_type] = []

    for image_idx in range(dataset[img_type]["data"].shape[0]):
        loaded_image = (
            loader(dataset[img_type]["data"][image_idx, :]).unsqueeze(0).to(device)
        )

        with torch.no_grad():
            res = model(loaded_image)
        features = res.data.detach().cpu().numpy().flatten()
        print(f"Extracting feature: {image_idx}/{dataset[img_type]['data'].shape[0]}")

        vgg_out[img_type].append(features)

    vgg_out[img_type] = np.asarray(vgg_out[img_type])
    print(vgg_out[img_type].shape)

pickle.dump(vgg_out, open(".pkl/vgg_out.pkl", "wb"))

Extracting feature: 0/1500
Extracting feature: 1/1500
Extracting feature: 2/1500
Extracting feature: 3/1500
Extracting feature: 4/1500
Extracting feature: 5/1500
Extracting feature: 6/1500
Extracting feature: 7/1500
Extracting feature: 8/1500
Extracting feature: 9/1500
Extracting feature: 10/1500
Extracting feature: 11/1500
Extracting feature: 12/1500
Extracting feature: 13/1500
Extracting feature: 14/1500
Extracting feature: 15/1500
Extracting feature: 16/1500
Extracting feature: 17/1500
Extracting feature: 18/1500
Extracting feature: 19/1500
Extracting feature: 20/1500
Extracting feature: 21/1500
Extracting feature: 22/1500
Extracting feature: 23/1500
Extracting feature: 24/1500
Extracting feature: 25/1500
Extracting feature: 26/1500
Extracting feature: 27/1500
Extracting feature: 28/1500
Extracting feature: 29/1500
Extracting feature: 30/1500
Extracting feature: 31/1500
Extracting feature: 32/1500
Extracting feature: 33/1500
Extracting feature: 34/1500
Extracting feature: 35/1500
Ex

In [3]:
# Each method is tested with a set of numer of components
n_components_to_test = {
    "PCA": [3, 10, 50, 100, 200, 500, 1200],
    "LDA": [3, 5, 7, 9],
    "TSNE": [2, 3],
}

In [4]:
# preload env
vgg_out = pickle.load(open(".pkl/vgg_out.pkl", "rb"))

# Results to compare the methods the number of component changes
results_PCA = []
results_LDA = []

PCAs_instances = {}
LDAs_instances = {}
TSNEs_instances = {}

PCAs_results = {
    "train": {},
    "valid": {},
}

LDAs_results = {
    "train": {},
    "valid": {},
}

TSNEs_results = {
    "train": {},
    "valid": {},
}

for n_components in n_components_to_test["PCA"]:
    print(f'[PCA] Extracting features (# components:{n_components})')
    
    PCAs_instances[n_components] = []

    PCAs_results["train"][n_components] = []
    PCAs_results["valid"][n_components] = []

    PCA_instance = PCA(n_components=n_components)

    PCA_instance.fit(vgg_out["train"])

    PCAs_results["train"][n_components] = PCA_instance.transform(vgg_out["train"])
    PCAs_results["valid"][n_components] = PCA_instance.transform(vgg_out["valid"])

    PCAs_instances[n_components] = PCA_instance
    
    results_PCA.append(
        {
            "METHOD": "PCA",
            "# Components": n_components,
            "CHANNEL": "RGB",
            "Explained Variance Ratio": np.sum(
                PCA_instance.explained_variance_ratio_, axis=0
            ),
        }
    )

for n_components in n_components_to_test["LDA"]:
    print(f'[LDA] Extracting features (# components:{n_components})')
    
    LDAs_instances[n_components] = []

    LDAs_results["train"][n_components] = []
    LDAs_results["valid"][n_components] = []

    LDA_instance = LinearDiscriminantAnalysis(n_components=n_components)

    LDA_instance.fit(vgg_out["train"], dataset["train"]["labels"])

    LDAs_results["train"][n_components] = LDA_instance.transform(vgg_out["train"])
    LDAs_results["valid"][n_components] = LDA_instance.transform(vgg_out["valid"])

    LDAs_instances[n_components] = LDA_instance

    results_LDA.append(
        {
            "METHOD": "LDA",
            "# Components": n_components,
            "CHANNEL": "RGB",
            "Explained Variance Ratio": np.sum(
                LDA_instance.explained_variance_ratio_, axis=0
            ),
        }
    )

for n_components in n_components_to_test["TSNE"]:
    print(f'[t-SNE] Extracting features (# components:{n_components})')

    TSNEs_instances[n_components] = []

    TSNEs_results["train"][n_components] = []

    TSNE_instance_train = TSNE(n_components=n_components, verbose=1, n_iter=3000)
    TSNE_instance_valid = TSNE(n_components=n_components, verbose=1, n_iter=3000)

    TSNEs_results["train"][n_components] = TSNE_instance_train.fit_transform(
        LDAs_results["train"][7]
    )
    TSNEs_results["valid"][n_components] = TSNE_instance_valid.fit_transform(
        LDAs_results["valid"][7]
    )

    TSNEs_instances[n_components] = [TSNE_instance_train, TSNE_instance_valid]


# Pandas tables
df_results_PCA = pd.DataFrame(results_PCA)
df_results_LDA = pd.DataFrame(results_LDA)

def highlight_cells(val):
    color = ""
    if val > 0.80:
        color = "background-color: lightgreen; color: black; font-weight: bold"
    elif val < 0.80:
        color = "background-color: lightcoral; color: black; font-weight: bold"
    return color

# Apply the style
df_results_PCA_styled = (
    df_results_PCA.style.map(highlight_cells, subset=["Explained Variance Ratio"])
    .set_caption("PCA Results")
    .set_properties(**{"text-align": "center"})
)

df_results_LDA_styled = (
    df_results_LDA.style.map(highlight_cells, subset=["Explained Variance Ratio"])
    .set_caption("LDA Results")
    .set_properties(**{"text-align": "center"})
)


pickle.dump(PCAs_results, open(".pkl/vgg_pca_out.pkl", "wb"))
pickle.dump(LDAs_results, open(".pkl/vgg_lda_out.pkl", "wb"))
pickle.dump(TSNEs_results, open(".pkl/vgg_tsne_out.pkl", "wb"))

display(df_results_PCA_styled)
display(df_results_LDA_styled)

[PCA] Extracting features (# components:3)
[0.02996423 0.02048971 0.01742603]
[PCA] Extracting features (# components:10)
[0.02996422 0.02048968 0.01742605 0.01123542 0.01054162 0.00948114
 0.008656   0.00766105 0.00714866 0.00691733]
[PCA] Extracting features (# components:50)
[0.02996423 0.02048969 0.01742603 0.01123543 0.01054164 0.00948118
 0.00865612 0.0076617  0.00715026 0.00691878 0.00634067 0.00612115
 0.00539468 0.00539097 0.00501659 0.00472771 0.00463642 0.00459523
 0.0043562  0.00424079 0.00406343 0.00397978 0.00390117 0.00379479
 0.0036678  0.00363544 0.00348592 0.0034434  0.00332897 0.00326923
 0.00320907 0.00315311 0.00304729 0.00299434 0.00296574 0.00292028
 0.00287376 0.00284058 0.00279697 0.00271531 0.00264048 0.00260975
 0.00259773 0.00250833 0.00248171 0.00246514 0.00241889 0.00240694
 0.0023717  0.00231239]
[PCA] Extracting features (# components:100)
[0.02996419 0.02048968 0.01742603 0.01123543 0.01054163 0.00948117
 0.00865612 0.00766171 0.00715025 0.00691877 0.00

Unnamed: 0,METHOD,# Components,CHANNEL,Explained Variance Ratio
0,PCA,3,RGB,0.06788
1,PCA,10,RGB,0.129521
2,PCA,50,RGB,0.271245
3,PCA,100,RGB,0.365943
4,PCA,200,RGB,0.48813
5,PCA,500,RGB,0.709697
6,PCA,1200,RGB,0.954115


Unnamed: 0,METHOD,# Components,CHANNEL,Explained Variance Ratio
0,LDA,3,RGB,0.636011
1,LDA,5,RGB,0.79597
2,LDA,7,RGB,0.909774
3,LDA,9,RGB,1.0


### 2D/3D Data visualization using t-SNE

In [4]:
vgg_out = pickle.load(open(".pkl/vgg_out.pkl", "rb"))
 
PCAs_results = pickle.load(open(".pkl/vgg_pca_out.pkl", "rb"))
LDAs_results = pickle.load(open(".pkl/vgg_lda_out.pkl", "rb"))
TSNEs_results = pickle.load(open(".pkl/vgg_tsne_out.pkl", "rb"))

# 2D plot
plt.ion()
plt.show()

tSNE_fig_2D = plt.figure()
tSNE_3D = tSNE_fig_2D.add_subplot()

for i in range(len(dataset["train"]["unique_labels"])):
    classIdxs = dataset["train"]["labels"] == i

    tsne_features = TSNEs_results["train"][2][classIdxs, :]

    tSNE_3D.set_label(dataset["train"]["unique_labels"][i])
    tSNE_3D.scatter(
        tsne_features[:, 0],
        tsne_features[:, 1],
        marker=".",
        label=dataset["train"]["unique_labels"][i],
    )

plt.legend(loc="upper left")

# 3D plot
tSNE_fig_3D = plt.figure()
tSNE_3D = tSNE_fig_3D.add_subplot(projection="3d")

for i in range(len(dataset["train"]["unique_labels"])):
    classIdxs = dataset["train"]["labels"] == i

    tsne_features = TSNEs_results["train"][3][classIdxs, :]

    tSNE_3D.scatter(
        tsne_features[:, 0],
        tsne_features[:, 1],
        tsne_features[:, 2],
        marker=".",
        label=dataset["train"]["unique_labels"][i],
    )

plt.legend(loc="upper left")

<matplotlib.legend.Legend at 0x7ec8c20f7e50>

### Classification - KNN

In [9]:
vgg_out = pickle.load(open(".pkl/vgg_out.pkl", "rb"))
 
PCAs_results = pickle.load(open(".pkl/vgg_pca_out.pkl", "rb"))
LDAs_results = pickle.load(open(".pkl/vgg_lda_out.pkl", "rb"))
TSNEs_results = pickle.load(open(".pkl/vgg_tsne_out.pkl", "rb"))

# Number of neighbors to test
k_to_test = {
    "VGG": [3, 5, 9, 15, 21, 55, 111, 251],
    "PCA": [3, 5, 9, 15, 21, 55, 111, 251],
    "LDA": [3, 5, 9, 15, 21, 55, 111, 251],
}

KNN_VGG_stats = []
KNN_PCA_stats = []
KNN_LDA_stats = []

for k_idx, k in enumerate(k_to_test["VGG"]):

    KNN_VGG_stats.insert(k_idx, [k])

    knn = OneVsOneClassifier(KNeighborsClassifier(k))

    knn.fit(vgg_out["train"], dataset["train"]["labels"])
    preds = knn.predict(vgg_out["valid"])

    accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
    precision = round(
        # f1_score(dataset["valid"]["labels"], preds, average="weighted")
        # precision_score(dataset["valid"]["labels"], preds, average="weighted", labels=np.unique(preds)), 3
    )

    KNN_VGG_stats[k_idx].append((accuracy, precision))

    # ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()

KNN_VGG_df = pd.DataFrame(KNN_VGG_stats, columns=["k\\VGG", ""])
display(KNN_VGG_df)

for k_idx, k in enumerate(k_to_test["PCA"]):

    KNN_PCA_stats.insert(k_idx, [k])

    for n_components_idx, n_components in enumerate(n_components_to_test["PCA"]):
        knn = OneVsOneClassifier(KNeighborsClassifier(k))

        knn.fit(PCAs_results["train"][n_components], dataset["train"]["labels"])
        preds = knn.predict(PCAs_results["valid"][n_components])

        accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
        precision = round(
            precision_score(dataset["valid"]["labels"], preds, average="weighted", labels=np.unique(preds)), 3
        )

        KNN_PCA_stats[k_idx].insert(n_components_idx + 1, (accuracy, precision))

        # ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()

KNN_PCA_df = pd.DataFrame(
    KNN_PCA_stats, columns=["k\\PCA components"] + n_components_to_test["PCA"]
)
display(KNN_PCA_df)

for k_idx, k in enumerate(k_to_test["LDA"]):

    KNN_LDA_stats.insert(k_idx, [k])

    for n_components_idx, n_components in enumerate(n_components_to_test["LDA"]):
        knn = OneVsOneClassifier(KNeighborsClassifier(k))

        knn.fit(LDAs_results["train"][n_components], dataset["train"]["labels"])
        preds = knn.predict(LDAs_results["valid"][n_components])

        accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
        precision = round(
            precision_score(dataset["valid"]["labels"], preds, average="weighted"), 3
        )

        KNN_LDA_stats[k_idx].insert(n_components_idx + 1, (accuracy, precision))
        # ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()


KNN_LDA_df = pd.DataFrame(
    KNN_LDA_stats, columns=["k\\LDA components"] + n_components_to_test["LDA"]
)
display(KNN_LDA_df)


Unnamed: 0,k\VGG,Unnamed: 2
0,3,"(0.346, 0.616)"
1,5,"(0.366, 0.592)"
2,9,"(0.362, 0.574)"
3,15,"(0.354, 0.63)"
4,21,"(0.324, 0.648)"
5,55,"(0.268, 0.79)"
6,111,"(0.242, 0.706)"
7,251,"(0.242, 0.767)"


Unnamed: 0,k\PCA components,3,10,50,100,200,500,1200
0,3,"(0.352, 0.347)","(0.544, 0.552)","(0.676, 0.69)","(0.648, 0.665)","(0.582, 0.631)","(0.5, 0.614)","(0.34, 0.527)"
1,5,"(0.4, 0.401)","(0.572, 0.584)","(0.69, 0.707)","(0.668, 0.708)","(0.602, 0.679)","(0.472, 0.631)","(0.328, 0.52)"
2,9,"(0.402, 0.398)","(0.586, 0.596)","(0.688, 0.737)","(0.642, 0.719)","(0.588, 0.691)","(0.458, 0.651)","(0.306, 0.717)"
3,15,"(0.368, 0.357)","(0.596, 0.608)","(0.698, 0.754)","(0.624, 0.725)","(0.526, 0.693)","(0.394, 0.677)","(0.282, 0.719)"
4,21,"(0.364, 0.359)","(0.598, 0.61)","(0.67, 0.737)","(0.618, 0.739)","(0.5, 0.714)","(0.37, 0.754)","(0.254, 0.774)"
5,55,"(0.378, 0.371)","(0.566, 0.583)","(0.606, 0.745)","(0.496, 0.707)","(0.398, 0.713)","(0.262, 0.757)","(0.212, 0.817)"
6,111,"(0.352, 0.357)","(0.512, 0.561)","(0.516, 0.71)","(0.438, 0.695)","(0.342, 0.736)","(0.246, 0.768)","(0.232, 0.643)"
7,251,"(0.328, 0.311)","(0.392, 0.503)","(0.4, 0.661)","(0.356, 0.535)","(0.358, 0.517)","(0.304, 0.544)","(0.316, 0.59)"


Unnamed: 0,k\LDA components,3,5,7,9
0,3,"(0.514, 0.55)","(0.626, 0.655)","(0.726, 0.73)","(0.748, 0.749)"
1,5,"(0.536, 0.574)","(0.648, 0.681)","(0.73, 0.736)","(0.754, 0.756)"
2,9,"(0.532, 0.568)","(0.678, 0.709)","(0.754, 0.762)","(0.784, 0.787)"
3,15,"(0.526, 0.563)","(0.67, 0.712)","(0.76, 0.77)","(0.78, 0.786)"
4,21,"(0.536, 0.582)","(0.684, 0.726)","(0.754, 0.767)","(0.778, 0.783)"
5,55,"(0.52, 0.579)","(0.672, 0.726)","(0.752, 0.769)","(0.806, 0.81)"
6,111,"(0.52, 0.581)","(0.65, 0.71)","(0.752, 0.772)","(0.792, 0.796)"
7,251,"(0.5, 0.569)","(0.604, 0.667)","(0.734, 0.773)","(0.778, 0.794)"


### Classification - SVM

In [13]:
kernels_to_test = {
    "VGG": ["linear", "poly", "sigmoid"], 
    "PCA": ["linear", "poly", "sigmoid"], 
    "LDA": ["linear", "poly", "sigmoid"], 
}

SVM_VGG_stats = []
SVM_PCA_stats = []
SVM_LDA_stats = []

for kernel_idx, kernel in enumerate(kernels_to_test["PCA"]):

    SVM_PCA_stats.insert(kernel_idx, [kernel])

    for n_components_idx, n_components in enumerate(n_components_to_test["PCA"]):
        svm = OneVsOneClassifier(SVC(kernel=kernel))

        svm.fit(PCAs_results["train"][n_components], dataset["train"]["labels"])

        preds = svm.predict(PCAs_results["valid"][n_components])

        accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
        precision = round(
            precision_score(dataset["valid"]["labels"], preds, average="weighted"), 3
        )

        SVM_PCA_stats[kernel_idx].insert(n_components_idx + 1, (accuracy, precision))

SVM_PCA_df = pd.DataFrame(
    SVM_PCA_stats, columns=["kernel\\PCA components"] + n_components_to_test["PCA"]
)
display(SVM_PCA_df)

for kernel_idx, kernel in enumerate(kernels_to_test["LDA"]):

    SVM_LDA_stats.insert(kernel_idx, [kernel])

    for n_components_idx, n_components in enumerate(n_components_to_test["LDA"]):
        svm = OneVsOneClassifier(SVC(kernel=kernel))

        svm.fit(LDAs_results["train"][n_components], dataset["train"]["labels"])

        preds = svm.predict(LDAs_results["valid"][n_components])

        accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
        precision = round(
            precision_score(dataset["valid"]["labels"], preds, average="weighted"), 3
        )

        SVM_LDA_stats[kernel_idx].insert(n_components_idx + 1, (accuracy, precision))

SVM_LDA_df = pd.DataFrame(
    SVM_LDA_stats, columns=["kernel\\LDA components"] + n_components_to_test["LDA"]
)
display(SVM_LDA_df)


for kernel_idx, kernel in enumerate(kernels_to_test["VGG"]):

    SVM_VGG_stats.insert(kernel_idx, [kernel])

    svm = OneVsOneClassifier(SVC(kernel=kernel))

    svm.fit(vgg_out["train"], dataset["train"]["labels"])

    preds = svm.predict(vgg_out["valid"])

    accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
    precision = round(
        precision_score(dataset["valid"]["labels"], preds, average="weighted"), 3
    )

    SVM_VGG_stats[kernel_idx].append((accuracy, precision))

SVM_VGG_df = pd.DataFrame(
    SVM_VGG_stats, columns=["kernel\\VGG components", ""]
)
display(SVM_VGG_df)


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,kernel\PCA components,3,10,50,100,200,500,1200
0,linear,"(0.386, 0.373)","(0.622, 0.623)","(0.73, 0.742)","(0.73, 0.731)","(0.776, 0.777)","(0.794, 0.799)","(0.806, 0.811)"
1,poly,"(0.378, 0.417)","(0.598, 0.645)","(0.624, 0.774)","(0.544, 0.771)","(0.396, 0.785)","(0.236, 0.872)","(0.218, 0.667)"
2,sigmoid,"(0.306, 0.305)","(0.548, 0.54)","(0.78, 0.785)","(0.796, 0.8)","(0.804, 0.81)","(0.832, 0.84)","(0.836, 0.842)"


Unnamed: 0,kernel\LDA components,3,5,7,9
0,linear,"(0.542, 0.576)","(0.664, 0.693)","(0.758, 0.776)","(0.774, 0.78)"
1,poly,"(0.498, 0.602)","(0.572, 0.755)","(0.646, 0.78)","(0.672, 0.788)"
2,sigmoid,"(0.41, 0.393)","(0.592, 0.583)","(0.694, 0.7)","(0.738, 0.745)"


Unnamed: 0,kernel\VGG components,Unnamed: 2
0,linear,"(0.802, 0.806)"
1,poly,"(0.53, 0.739)"
2,sigmoid,"(0.832, 0.837)"


### Classification - GMM

In [10]:
gmm_n_components_to_test = {
    "VGG": [15, 251],
    "PCA": [15, 251],
    "LDA": [15, 251],
}

GMM_VGG_stats = []
GMM_PCA_stats = []
GMM_LDA_stats = []

# for gmm_components_idx, gmm_n_components in enumerate(gmm_n_components_to_test["VGG"]):

#     GMM_VGG_stats.insert(gmm_components_idx, [gmm_n_components])

#     GMM = OneVsOneClassifier(GaussianMixture(n_components=gmm_n_components))

#     GMM.fit(vgg_out["train"], dataset["train"]["labels"])
#     preds = GMM.predict(vgg_out["valid"])

#     accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
#     precision = round(
#         precision_score(dataset["valid"]["labels"], preds, average="weighted", labels=np.unique(preds)), 3
#     )

#     GMM_VGG_stats[gmm_components_idx].append((accuracy, precision))

# GMM_VGG_df = pd.DataFrame(GMM_VGG_stats, columns=["components\\VGG", ""])
# display(GMM_VGG_df)

for gmm_components_idx, gmm_n_components in enumerate(gmm_n_components_to_test["PCA"]):

    GMM_PCA_stats.insert(gmm_components_idx, [gmm_n_components])

    for n_components_idx, n_components in enumerate([ 500 ]):#enumerate(n_components_to_test["PCA"]):
        GMM = GaussianMixture(n_components=gmm_n_components)

        GMM.fit(PCAs_results["train"][n_components], dataset["train"]["labels"])
        preds = GMM.predict(PCAs_results["valid"][n_components])

        accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
        precision = round(
            precision_score(dataset["valid"]["labels"], preds, average="weighted", labels=np.unique(preds)), 3
        )

        GMM_PCA_stats[gmm_components_idx].insert(n_components_idx + 1, (accuracy, precision))

GMM_PCA_df = pd.DataFrame(
    GMM_PCA_stats, columns=["components\\PCA components"] + [ 500 ]#n_components_to_test["PCA"]
)
display(GMM_PCA_df)

for gmm_components_idx, gmm_n_components in enumerate([7]):#enumerate(gmm_n_components_to_test["LDA"]):

    GMM_LDA_stats.insert(gmm_components_idx, [gmm_n_components])

    for n_components_idx, n_components in enumerate(n_components_to_test["LDA"]):
        GMM = GaussianMixture(n_components=gmm_n_components)

        GMM.fit(LDAs_results["train"][n_components], dataset["train"]["labels"])
        preds = GMM.predict(LDAs_results["valid"][n_components])

        accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
        precision = round(
            precision_score(dataset["valid"]["labels"], preds, average="weighted"), 3
        )

        GMM_LDA_stats[gmm_components_idx].insert(n_components_idx + 1, (accuracy, precision))


GMM_LDA_df = pd.DataFrame(
    GMM_LDA_stats, columns=["components\\LDA components"] + [ 7 ] # n_components_to_test["LDA"]
)
display(GMM_LDA_df)

Unnamed: 0,components\PCA components,500
0,15,"(0.106, 0.059)"
1,251,"(0.002, 1.0)"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: 2 columns passed, passed data had 5 columns

### Classification - SGD

In [17]:

losses_to_test = {
    "VGG": ["modified_huber", "log_loss", "hinge"],
    "PCA": ["modified_huber", "log_loss", "hinge"],
    "LDA": ["modified_huber", "log_loss", "hinge"]
}

SGD_VGG_stats = []
SGD_PCA_stats = []
SGD_LDA_stats = []

for loss_idx, loss in enumerate(losses_to_test["VGG"]):

    SGD_VGG_stats.insert(loss_idx, [loss])

    svm = SGDClassifier(loss=loss, max_iter=10000)

    svm.fit(vgg_out["train"], dataset["train"]["labels"])

    preds = svm.predict(vgg_out["valid"])

    accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
    precision = round(
        precision_score(dataset["valid"]["labels"], preds, average="weighted"), 3
    )

    SGD_VGG_stats[loss_idx].append((accuracy, precision))

SVM_VGG_df = pd.DataFrame(
    SGD_VGG_stats, columns=["loss\\VGG", ""]
)
display(SVM_VGG_df)

for loss_idx, loss in enumerate(losses_to_test["PCA"]):

    SGD_PCA_stats.insert(loss_idx, [loss])

    for n_components_idx, n_components in enumerate(n_components_to_test["PCA"]):
        sgd = SGDClassifier(loss=loss, max_iter=10000)

        sgd.fit(PCAs_results["train"][n_components], dataset["train"]["labels"])

        preds = sgd.predict(PCAs_results["valid"][n_components])

        accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
        precision = round(
            precision_score(dataset["valid"]["labels"], preds, average="macro"), 3
        )

        SGD_PCA_stats[loss_idx].insert(
            n_components_idx + 1, (accuracy, precision)
        )
        # ConfusionMatrixDisplay(confusion_matrix(dataset['valid']['labels'], preds),display_labels=dataset['valid']['unique_labels']).plot()

SGD_PCA_df = pd.DataFrame(
    SGD_PCA_stats, columns=["loss\\PCA"] + n_components_to_test["PCA"]
)
display(SGD_PCA_df)

for loss_idx, loss in enumerate(losses_to_test["LDA"]):

    SGD_LDA_stats.insert(loss_idx, [loss])

    for n_components_idx, n_components in enumerate(n_components_to_test["LDA"]):
        sgd = SGDClassifier(loss=loss, max_iter=10000)

        sgd.fit(LDAs_results["train"][n_components], dataset["train"]["labels"])

        preds = sgd.predict(LDAs_results["valid"][n_components])

        accuracy = round(accuracy_score(dataset["valid"]["labels"], preds), 3)
        precision = round(
            precision_score(dataset["valid"]["labels"], preds, average="macro"), 3
        )

        SGD_LDA_stats[loss_idx].insert(
            n_components_idx + 1, (accuracy, precision)
        )

SGD_LDA_df = pd.DataFrame(
    SGD_LDA_stats, columns=["loss\\LDA"] + n_components_to_test["LDA"]
)
display(SGD_LDA_df)

Unnamed: 0,loss\VGG,Unnamed: 2
0,modified_huber,"(0.802, 0.806)"
1,log_loss,"(0.786, 0.793)"
2,hinge,"(0.814, 0.822)"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,loss\PCA,3,10,50,100,200,500,1200
0,modified_huber,"(0.29, 0.298)","(0.598, 0.592)","(0.744, 0.747)","(0.75, 0.746)","(0.748, 0.748)","(0.772, 0.773)","(0.806, 0.808)"
1,log_loss,"(0.308, 0.381)","(0.582, 0.63)","(0.752, 0.753)","(0.74, 0.737)","(0.75, 0.75)","(0.78, 0.784)","(0.788, 0.795)"
2,hinge,"(0.312, 0.376)","(0.574, 0.557)","(0.734, 0.736)","(0.75, 0.749)","(0.74, 0.739)","(0.768, 0.776)","(0.796, 0.798)"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,loss\LDA,3,5,7,9
0,modified_huber,"(0.422, 0.441)","(0.522, 0.488)","(0.648, 0.706)","(0.72, 0.737)"
1,log_loss,"(0.516, 0.635)","(0.64, 0.679)","(0.72, 0.736)","(0.754, 0.775)"
2,hinge,"(0.48, 0.538)","(0.598, 0.686)","(0.68, 0.693)","(0.722, 0.749)"
