# Models metrics comparison

In [1]:
import os
import sys
import json

sys.path.insert(0, '../src')

import numpy as np
import pandas as pd

from loader import ImagesDataset

In [2]:
# Single fixed seed shared by the random generator below and the dataset split.
SEED = 11

# NumPy random generator initialised from SEED.
rng = np.random.default_rng(seed=SEED)

In [3]:
# Directory the images are loaded from.
DATA_PATH = '../images'
# NOTE(review): SHAPE is not referenced in the visible cells — presumably
# (height, width, channels) of the images; confirm before relying on it.
SHAPE = (128, 128, 3)

dataset = ImagesDataset(
    path=DATA_PATH,
    preload=False,
    encode_labels=True,
)

# Class names as stored by the dataset's label encoder.
classes = dataset.label_encoder.classes_.tolist()
n_classes = len(classes)

# SEED is handed to the `shuffle` argument of the split.
dataset.split(shuffle=SEED)

# Distinct labels among the training samples, how often each occurs, and
# the corresponding fraction of the training set.
train_labels = dataset.labels[dataset.train.indices]
train_uniques, train_counts = np.unique(train_labels, return_counts=True)
train_shares = train_counts / train_counts.sum()

100%|[38;2;76;175;80m████████████████████████████████████████████[0m| 9/9 [00:00<00:00, 461.35it/s][0m


In [4]:
# Metrics that the later report cell renders with one row per class ...
classwise_metrics = ['precision', 'recall']
# ... and metrics rendered as a single overall row.
general_metrics = ['accuracy', 'f1_score']
metrics_names = classwise_metrics + general_metrics

METRICS_PATH = '../metrics'


def _model_name(file_name):
    """Return the model name encoded in a metrics file name.

    The name is whatever follows the first underscore of the file name
    without its extension (``'3_VGG.json'`` -> ``'VGG'``). Later
    underscores are kept, so ``'4_VGG_KMeans.json'`` yields
    ``'VGG_KMeans'`` instead of being truncated to ``'VGG'``. A file name
    without an underscore falls back to the whole stem instead of raising
    IndexError.
    """
    stem = os.path.splitext(file_name)[0]
    _prefix, separator, name = stem.partition('_')
    return name if separator and name else stem


# metrics[metric_name][model_name] -> value read from that model's JSON file.
metrics = {name: {} for name in metrics_names}

# Sorted listing: models are inserted (and later shown) in file-name order.
for file in sorted(os.listdir(METRICS_PATH)):
    if not file.endswith('.json'):
        continue

    file_path = os.path.join(METRICS_PATH, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        metrics_dict = json.load(f)

    # Fail before touching `metrics`, naming the offending file, rather than
    # leaving a model registered under only some of the metrics.
    missing = [name for name in metrics_names if name not in metrics_dict]
    if missing:
        raise KeyError(f"{file_path} lacks metrics: {missing}")

    model_name = _model_name(file)
    for name in metrics_names:
        metrics[name][model_name] = metrics_dict[name]

In [6]:
# CSS applied to every table: enlarged monospace caption, monospace
# right-aligned table body.
styles = [
    {
        'selector': "caption",
        'props': [
            ('font-family', 'monospace'),
            ("font-size", "150%")
        ]
    },
    {
        'selector': "tbody",
        'props': [
            ('font-family', 'monospace'),
            ('text-align', 'right'),
        ]
    },
]

REPORT_PATH = '../report/report.html'


def formatter(value):
    """Render a fraction as a percentage figure rounded to one decimal."""
    return f"{(value * 100).round(1)}"


# One HTML fragment per metric table; joined once after the loop.
html_parts = []

for metrics_name in metrics_names:
    metrics_dict = metrics[metrics_name]

    if metrics_name in classwise_metrics:
        # One row per class; the leading column is the class share in the
        # training set.
        metrics_dict = {'shares': train_shares, **metrics_dict}
        index = classes
    else:
        # Single overall row; the leading column is 1 / n_classes.
        metrics_dict = {'shares': 1 / n_classes, **metrics_dict}
        index = ['\u00A0' * 3 + 'altogether']

    df = pd.DataFrame(data=metrics_dict, index=index)
    styled_df = (
        df.style
        .background_gradient('RdYlGn', vmin=0, vmax=1, axis=0)
        .format(formatter)
        .set_caption(metrics_name)
        .set_table_styles(styles)
    )
    display(styled_df)

    # Styler.render() was deprecated in pandas 1.4 and removed in 2.0 in
    # favour of Styler.to_html(); fall back to render() only on pandas
    # versions that predate to_html().
    to_html = getattr(styled_df, 'to_html', None) or styled_df.render
    html_parts.append(to_html())

html = ''.join(html_parts)

# Create the report directory if needed so the rendered tables are not lost
# at the very last step.
os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)
# Explicit encoding: the index label contains non-breaking spaces (U+00A0).
with open(REPORT_PATH, 'w', encoding='utf-8') as f:
    f.write(html)

Unnamed: 0,shares,GUESS,LOGISTIC,VGG,EFFICIENTNET,KMeans,LGBM
ArtDeco,4.7,7.1,0.0,75.0,50.0,66.7,40.0
Cubism,25.7,30.0,34.2,64.1,71.1,72.5,71.2
Impressionism,17.0,15.7,40.0,51.7,63.8,57.1,54.5
Japonism,14.1,10.8,31.2,75.9,67.7,74.2,63.6
Naturalism,15.2,14.3,41.9,83.8,73.3,78.9,80.0
Rococo,8.6,9.1,20.0,52.0,52.8,48.3,65.0
cartoon,4.9,12.5,12.5,63.6,53.3,72.7,77.8
photo,9.7,13.6,22.6,79.2,81.8,66.7,71.9


Unnamed: 0,shares,GUESS,LOGISTIC,VGG,EFFICIENTNET,KMeans,LGBM
ArtDeco,4.7,8.3,0.0,50.0,30.0,50.0,50.0
Cubism,25.7,33.3,18.1,81.9,84.3,80.6,79.2
Impressionism,17.0,17.4,26.1,65.2,68.2,69.6,65.2
Japonism,14.1,9.5,35.7,52.4,60.0,54.8,50.0
Naturalism,15.2,13.3,40.0,68.9,62.3,66.7,71.1
Rococo,8.6,10.5,57.9,68.4,65.5,73.7,68.4
cartoon,4.9,10.5,15.8,36.8,50.0,42.1,36.8
photo,9.7,10.3,24.1,65.5,64.3,69.0,79.3


Unnamed: 0,shares,GUESS,LOGISTIC,VGG,EFFICIENTNET,KMeans,LGBM
altogether,12.5,17.6,27.8,65.8,67.0,67.3,66.5


Unnamed: 0,shares,GUESS,LOGISTIC,VGG,EFFICIENTNET,KMeans,LGBM
altogether,12.5,14.2,26.2,64.5,62.3,65.2,64.0


### Conclusion

Clustering of embeddings via KMeans produces the highest metric values so far, in both accuracy and F1 score. The latter is basically the harmonic mean of precision and recall, so it is a reasonable overall criterion as long as there are no extra assumptions about the importance of specific classes (e.g. that errors on ArtDeco should be minimized at all costs).

🥇 I conclude that the VGG + KMeans combo is the best model among those compared.

🥈 The honorable second place goes to boosting via LGBM. 

🥉 Finally, third place is shared between VGG and EfficientNet. What's interesting about VGG here is that it deals slightly better with rare classes, like cartoon and ArtDeco.

### Criticism

There is much more to test and research here. Among the most important ideas not yet implemented are:

- cross-validation: the metrics above are obtained with a single fixed seed, so an estimate of model performance based solely on them is rather weak;
- augmentations: the dataset is small, and even simple augmentations like flip, rotate, cutout, crop and noise might significantly enrich it and therefore increase the models' generalization capability and reduce overfitting;
- extensive hyper-parameter search: the values fixed in the presented experiment were obtained via a minimal search over a limited parameter domain;
- algorithms: only a small subset of possible approaches was checked during this experiment — there are more architectures (e.g. with residual connections or attention modules) and clustering or embedding algorithms (e.g. t-SNE, DBSCAN, SOM) that might perform better on the given data;
- find more data: there are open datasets containing images of various styles (e.g. ArtGAN); they might contribute a lot.